diff --git a/.github/PULL_REQUEST_AUTOMATIC_TEMPLATE.md b/.github/PULL_REQUEST_AUTOMATIC_TEMPLATE.md new file mode 100644 index 000000000000..1f3e7b69e2e0 --- /dev/null +++ b/.github/PULL_REQUEST_AUTOMATIC_TEMPLATE.md @@ -0,0 +1,6 @@ +:auto_rickshaw: This PR should be merged automatically once it has been approved. If it doesn't happen: +- [ ] Handle merge conflicts +- [ ] Fix build errors + + +:bulb: It has been opened automatically after changes were merged in a feature branch. diff --git a/.github/workflows/ci-docs-tests.yml b/.github/workflows/ci-docs-tests.yml index 65a68c861c84..3963b5eb36ee 100644 --- a/.github/workflows/ci-docs-tests.yml +++ b/.github/workflows/ci-docs-tests.yml @@ -41,7 +41,7 @@ jobs: python-version: ${{ env.DEFAULT_PYTHON_VERSION }} - name: Set up Node 12.x 🦙 - uses: actions/setup-node@v2.2.0 + uses: actions/setup-node@v2.3.0 with: node-version: '12.x' diff --git a/.github/workflows/ci-model-regression-on-schedule.yml b/.github/workflows/ci-model-regression-on-schedule.yml index b301175144bb..2184d1a8f3d5 100644 --- a/.github/workflows/ci-model-regression-on-schedule.yml +++ b/.github/workflows/ci-model-regression-on-schedule.yml @@ -354,7 +354,7 @@ jobs: - name: Notify Slack of Failure 😱 if: failure() && steps.issue-exists.outputs.result == 'false' - uses: 8398a7/action-slack@f3635935f58910a6d6951b73efe9037c960c8c04 # v3 + uses: 8398a7/action-slack@e74cd4e48f4452e8158dc4f8bcfc780ae6203364 # v3 with: status: custom fields: workflow,job,commit,repo,ref,author,took @@ -505,7 +505,7 @@ jobs: - name: Notify Slack when Performance Drops 💬 if: steps.performance.outputs.is_dropped == 'true' && steps.issue-exists.outputs.result == 'false' - uses: 8398a7/action-slack@f3635935f58910a6d6951b73efe9037c960c8c04 #v3 + uses: 8398a7/action-slack@e74cd4e48f4452e8158dc4f8bcfc780ae6203364 #v3 with: status: custom fields: workflow,job,commit,repo,ref,author,took diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml index 40802746622d..f4e455f38ea9 100644 --- a/.github/workflows/documentation.yml +++ b/.github/workflows/documentation.yml @@ -32,7 +32,7 @@ jobs: python-version: 3.7 - name: Set up Node 12.x 🦙 - uses: actions/setup-node@v2.2.0 + uses: actions/setup-node@v2.3.0 with: node-version: '12.x' diff --git a/CHANGELOG.mdx b/CHANGELOG.mdx index b1dff106517e..4b676cd09feb 100644 --- a/CHANGELOG.mdx +++ b/CHANGELOG.mdx @@ -1321,7 +1321,7 @@ https://github.com/RasaHQ/rasa/tree/main/changelog/ . --> ### Bugfixes -- [#7089](https://github.com/rasahq/rasa/issues/7089): Fix [ConveRTTokenizer](components.mdx#converttokenizer) failing because of wrong model URL by making the `model_url` parameter of `ConveRTTokenizer` mandatory. +- [#7089](https://github.com/rasahq/rasa/issues/7089): Fix `ConveRTTokenizer` failing because of wrong model URL by making the `model_url` parameter of `ConveRTTokenizer` mandatory. Since the ConveRT model was taken [offline](https://github.com/RasaHQ/rasa/issues/6806), we can no longer use the earlier public URL of the model. Additionally, since the licence for the model is unknown, @@ -2362,7 +2362,7 @@ https://github.com/RasaHQ/rasa/tree/main/changelog/ . --> * [#5006](https://github.com/rasahq/rasa/issues/5006): Channel `hangouts` for Rasa integration with Google Hangouts Chat is now supported out-of-the-box. -* [#5389](https://github.com/rasahq/rasa/issues/5389): Add an optional path to a specific directory to download and cache the pre-trained model weights for [HFTransformersNLP](./components.mdx#hftransformersnlp). 
+* [#5389](https://github.com/rasahq/rasa/issues/5389): Add an optional path to a specific directory to download and cache the pre-trained model weights for `HFTransformersNLP`.
 
 * [#5422](https://github.com/rasahq/rasa/issues/5422): Add options `tensorboard_log_directory` and `tensorboard_log_level` to `EmbeddingIntentClassifier`, `DIETClassifier`, `ResponseSelector`, `EmbeddingPolicy` and `TEDPolicy`.
 
@@ -2529,10 +2529,10 @@ https://github.com/RasaHQ/rasa/tree/main/changelog/ . -->
 * [#5187](https://github.com/rasahq/rasa/issues/5187): Integrate language models from HuggingFace's [Transformers](https://github.com/huggingface/transformers) Library.
-  Add a new NLP component [HFTransformersNLP](./components.mdx#hftransformersnlp) which tokenizes and featurizes incoming messages using a specified
+  Add a new NLP component `HFTransformersNLP` which tokenizes and featurizes incoming messages using a specified
   pre-trained model with the Transformers library as the backend.
-  Add [LanguageModelTokenizer](./components.mdx#languagemodeltokenizer) and [LanguageModelFeaturizer](./components.mdx#languagemodelfeaturizer) which use the information from
-  [HFTransformersNLP](./components.mdx#hftransformersnlp) and sets them correctly for message object.
+  Add `LanguageModelTokenizer` and [LanguageModelFeaturizer](./components.mdx#languagemodelfeaturizer) which use the information from
+  `HFTransformersNLP` and set it correctly on the message object.
   Language models currently supported: BERT, OpenAIGPT, GPT-2, XLNet, DistilBert, RoBERTa.
 
 * [#5225](https://github.com/rasahq/rasa/issues/5225): Added a new CLI command `rasa export` to publish tracker events from a persistent
diff --git a/changelog/6487.removal.md b/changelog/6487.removal.md
new file mode 100644
index 000000000000..6e12da5f72c4
--- /dev/null
+++ b/changelog/6487.removal.md
@@ -0,0 +1,3 @@
+Remove backwards compatibility code with Rasa Open Source 1.x, Rasa Enterprise 0.35, and other outdated
+backwards compatibility code in `rasa.cli.x`, `rasa.core.utils`, `rasa.model_testing`, `rasa.model_training`
+and `rasa.shared.core.events`.
diff --git a/changelog/8879.removal.md b/changelog/8879.removal.md
new file mode 100644
index 000000000000..8991d1c3e5e7
--- /dev/null
+++ b/changelog/8879.removal.md
@@ -0,0 +1,3 @@
+Removed the deprecated dialogue policy state featurizers: `BinarySingleStateFeaturizer` and `LabelTokenizerSingleStateFeaturizer`.
+
+Removed the deprecated method `encode_all_actions` of `SingleStateFeaturizer`. Use `encode_all_labels` instead.
diff --git a/changelog/8881.removal.md b/changelog/8881.removal.md
new file mode 100644
index 000000000000..6830d135708f
--- /dev/null
+++ b/changelog/8881.removal.md
@@ -0,0 +1 @@
+Follow through on deprecation warnings and remove code, tests, and docs for `ConveRTTokenizer`, `LanguageModelTokenizer` and `HFTransformersNLP`.
diff --git a/changelog/8929.improvement.md b/changelog/8929.improvement.md
new file mode 100644
index 000000000000..cfeb73400914
--- /dev/null
+++ b/changelog/8929.improvement.md
@@ -0,0 +1 @@
+Added optional flag to convert intent ID hashes from integer to string in the `KafkaEventBroker`.
\ No newline at end of file
diff --git a/changelog/9135.misc.md b/changelog/9135.misc.md
new file mode 100644
index 000000000000..48525b941d76
--- /dev/null
+++ b/changelog/9135.misc.md
@@ -0,0 +1,2 @@
+Remove `MessageProcessor` logic when determining whether to predict another action in `rasa.core.test` module.
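An editor's sketch, not part of the patch: it shows roughly how callers consume the behavior this entry describes, based on the `rasa/core/processor.py` and `rasa/core/test.py` hunks later in this diff. The `processor` and `tracker` objects are assumed to be an already-initialized `MessageProcessor` and `DialogueStateTracker`.

```python
# A minimal sketch, assuming `processor` (rasa.core.processor.MessageProcessor)
# and `tracker` (rasa.shared.core.trackers.DialogueStateTracker) already exist.
# It mirrors the try/except pattern this patch introduces in
# `_predict_and_execute_next_action` and in `rasa.core.test`.
from rasa.exceptions import ActionLimitReached

try:
    action, prediction = processor.predict_next_action(tracker)
except ActionLimitReached:
    # Circuit breaker tripped: stop predicting more actions for this sender
    # instead of counting predictions at every call site.
    action, prediction = None, None
```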
+Adapt `MessageProcessor.predict_next_action()` method to raise `ActionLimitReached` exception instead. diff --git a/data/test_dialogues/default.json b/data/test_dialogues/default.json deleted file mode 100644 index e915b2c05cc3..000000000000 --- a/data/test_dialogues/default.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "py/object": "rasa.shared.core.conversation.Dialogue", - "name": "default", - "events": [ - { - "py/object": "rasa.shared.core.events.ActionExecuted", - "action_name": "action_listen", - "action_text": null, - "confidence": null, - "policy": null, - "timestamp": 1551952977.4850519, - "unpredictable": false, - "hide_rule_turn": false - }, - { - "py/object": "rasa.shared.core.events.UserUttered", - "entities": [ - { - "end": 19, - "entity": "name", - "start": 14, - "value": "Peter" - } - ], - "input_channel": null, - "intent": { - "confidence": 0.0, - "name": "greet" - }, - "message_id": null, - "parse_data": { - "entities": [ - { - "end": 19, - "entity": "name", - "start": 14, - "value": "Peter" - } - ], - "intent": { - "confidence": 0.0, - "name": "greet" - }, - "message_id": null, - "metadata": {}, - "text": "Hi my name is Peter" - }, - "text": "Hi my name is Peter", - "timestamp": 1551953035.076376, - "use_text_for_featurization": false - }, - { - "py/object": "rasa.shared.core.events.SlotSet", - "key": "name", - "timestamp": 1551953035.076385, - "value": "Peter" - }, - { - "py/object": "rasa.shared.core.events.ActionExecuted", - "action_name": "utter_greet", - "action_text": null, - "confidence": null, - "policy": null, - "timestamp": 1551953040.607782, - "unpredictable": false, - "hide_rule_turn": false - }, - { - "py/object": "rasa.shared.core.events.BotUttered", - "data": { - "attachment": null, - "buttons": null, - "elements": null - }, - "text": "hey there Peter!", - "timestamp": 1551953040.60779 - } - ] -} diff --git a/data/test_dialogues/formbot.json b/data/test_dialogues/formbot.json deleted file mode 100644 index 5a84c1b3c829..000000000000 --- a/data/test_dialogues/formbot.json +++ /dev/null @@ -1,214 +0,0 @@ -{ - "py/object":"rasa.shared.core.conversation.Dialogue", - "name":"formbot", - "events":[ - { - "py/object":"rasa.shared.core.events.ActionExecuted", - "action_name":"action_listen", - "action_text": null, - "confidence":null, - "policy":null, - "timestamp":1551884035.892855, - "unpredictable":false, - "hide_rule_turn": false - }, - { - "py/object":"rasa.shared.core.events.UserUttered", - "entities":[ - - ], - "input_channel":null, - "intent":{ - "confidence":0.3748943507671356, - "name":"greet" - }, - "message_id":null, - "parse_data":{ - "entities":[ - - ], - "intent":{ - "confidence":0.3748943507671356, - "name":"greet" - }, - "text":"Hi I'm desperate to talk to you" - }, - "text":"Hi I'm desperate to talk to you", - "timestamp":1551884050.259948, - "use_text_for_featurization": false - }, - { - "py/object":"rasa.shared.core.events.ActionExecuted", - "action_name":"utter_greet", - "action_text": null, - "confidence":1.0, - "policy":"policy_2_MemoizationPolicy", - "timestamp":1551884060.466681, - "unpredictable":false, - "hide_rule_turn": false - }, - { - "py/object":"rasa.shared.core.events.BotUttered", - "data":{ - "attachment":null, - "buttons":null, - "elements":null - }, - "text":"Hello! I am restaurant search assistant! 
How can I help?", - "timestamp":1551884060.46669 - }, - { - "py/object":"rasa.shared.core.events.ActionExecuted", - "action_name":"action_listen", - "action_text": null, - "confidence":1.0, - "policy":"policy_2_MemoizationPolicy", - "timestamp":1551884061.9350882, - "unpredictable":false, - "hide_rule_turn": false - }, - { - "py/object":"rasa.shared.core.events.UserUttered", - "entities":[ - { - "end":18, - "entity":"cuisine", - "start":16, - "value":"an" - }, - { - "end":48, - "entity":"location", - "start":42, - "value":"Bombay" - } - ], - "input_channel":null, - "intent":{ - "confidence":0.9414282441139221, - "name":"request_restaurant" - }, - "message_id":null, - "parse_data":{ - "entities":[ - { - "end":18, - "entity":"cuisine", - "start":16, - "value":"an" - }, - { - "end":48, - "entity":"location", - "start":42, - "value":"Bombay" - } - ], - "intent":{ - "confidence":0.9414282441139221, - "name":"request_restaurant" - }, - "text":"I'm looking for an indian restaurant...in Bombay" - }, - "text":"I'm looking for an indian restaurant...in Bombay", - "timestamp":1551884090.9653602, - "use_text_for_featurization": false - }, - { - "py/object":"rasa.shared.core.events.ActionExecuted", - "action_name":"restaurant_form", - "action_text": null, - "confidence":1.0, - "policy":"policy_2_MemoizationPolicy", - "timestamp":1551884095.542748, - "unpredictable":false, - "hide_rule_turn": false - }, - { - "py/object":"rasa.shared.core.events.ActionExecuted", - "action_name":"utter_slots_values", - "action_text": null, - "confidence":1.0, - "policy":"policy_2_MemoizationPolicy", - "timestamp":1551884097.570883, - "unpredictable":false, - "hide_rule_turn": false - }, - { - "py/object":"rasa.shared.core.events.BotUttered", - "data":{ - "attachment":null, - "buttons":null, - "elements":null - }, - "text":"I am going to run a restaurant search using the following parameters:\n - cuisine: None\n - num_people: None\n - outdoor_seating: None\n - preferences: None\n - feedback: None", - "timestamp":1551884097.57089 - }, - { - "py/object":"rasa.shared.core.events.ActionExecuted", - "action_name":"action_listen", - "action_text": null, - "confidence":1.0, - "policy":"policy_2_MemoizationPolicy", - "timestamp":1551884098.8006358, - "unpredictable":false, - "hide_rule_turn": false - }, - { - "py/object":"rasa.shared.core.events.UserUttered", - "entities":[ - - ], - "input_channel":null, - "intent":{ - "confidence":0.2287036031484604, - "name":"affirm" - }, - "message_id":null, - "parse_data":{ - "entities":[ - - ], - "intent":{ - "confidence":0.2287036031484604, - "name":"affirm" - }, - "text":"Let's just pretend everything went correctly" - }, - "text":"Let's just pretend everything went correctly", - "timestamp":1551884208.092693, - "use_text_for_featurization": false - }, - { - "py/object":"rasa.shared.core.events.ActionExecuted", - "action_name":"action_deactivate_loop", - "action_text": null, - "confidence":null, - "policy":null, - "timestamp":1551884214.951055, - "unpredictable":false, - "hide_rule_turn": false - }, - { - "py/object":"rasa.shared.core.events.ActiveLoop", - "name":null, - "timestamp":1551884214.9510589 - }, - { - "py/object":"rasa.shared.core.events.SlotSet", - "key":"requested_slot", - "timestamp":1551884214.951062, - "value":null - }, - { - "py/object":"rasa.shared.core.events.ActionExecuted", - "action_name":"action_listen", - "action_text": null, - "confidence":0.7680902069097734, - "policy":"policy_0_TEDPolicy", - "timestamp":1551884216.705635, - "unpredictable":false, - 
"hide_rule_turn": false - } - ] -} \ No newline at end of file diff --git a/data/test_dialogues/moodbot.json b/data/test_dialogues/moodbot.json deleted file mode 100644 index e134a07535d1..000000000000 --- a/data/test_dialogues/moodbot.json +++ /dev/null @@ -1,278 +0,0 @@ -{ - "py/object":"rasa.shared.core.conversation.Dialogue", - "name":"moodbot", - "events":[ - { - "py/object":"rasa.shared.core.events.ActionExecuted", - "action_name":"action_listen", - "action_text": null, - "confidence":null, - "policy":null, - "timestamp":1551883958.346432, - "unpredictable":false, - "hide_rule_turn": false - }, - { - "py/object":"rasa.shared.core.events.UserUttered", - "entities":[ - - ], - "input_channel":null, - "intent":{ - "confidence":0.44488201660555066, - "name":"greet" - }, - "message_id":null, - "parse_data":{ - "entities":[ - - ], - "intent":{ - "confidence":0.44488201660555066, - "name":"greet" - }, - "intent_ranking":[ - { - "confidence":0.44488201660555066, - "name":"greet" - }, - { - "confidence":0.29023286595689257, - "name":"goodbye" - }, - { - "confidence":0.10501227521380094, - "name":"mood_great" - }, - { - "confidence":0.06879303900502878, - "name":"mood_unhappy" - }, - { - "confidence":0.04903582960375451, - "name":"deny" - }, - { - "confidence":0.04204397361497238, - "name":"affirm" - } - ], - "text":"Hi talk to me" - }, - "text":"Hi talk to me", - "timestamp":1551883971.410778, - "use_text_for_featurization": false - }, - { - "py/object":"rasa.shared.core.events.ActionExecuted", - "action_name":"utter_greet", - "action_text": null, - "confidence":1.0, - "policy":"policy_2_MemoizationPolicy", - "timestamp":1551883975.6456478, - "unpredictable":false, - "hide_rule_turn": false - }, - { - "py/object":"rasa.shared.core.events.BotUttered", - "data":{ - "attachment":null, - "buttons":[ - { - "payload":"great", - "title":"great" - }, - { - "payload":"super sad", - "title":"super sad" - } - ], - "elements":null - }, - "text":"Hey! 
How are you?", - "timestamp":1551883975.645656 - }, - { - "py/object":"rasa.shared.core.events.ActionExecuted", - "action_name":"action_listen", - "action_text": null, - "confidence":1.0, - "policy":"policy_2_MemoizationPolicy", - "timestamp":1551883979.098331, - "unpredictable":false, - "hide_rule_turn": false - }, - { - "py/object":"rasa.shared.core.events.UserUttered", - "entities":[ - - ], - "input_channel":null, - "intent":{ - "confidence":0.7417646502470048, - "name":"mood_unhappy" - }, - "message_id":null, - "parse_data":{ - "entities":[ - - ], - "intent":{ - "confidence":0.7417646502470048, - "name":"mood_unhappy" - }, - "intent_ranking":[ - { - "confidence":0.7417646502470048, - "name":"mood_unhappy" - }, - { - "confidence":0.1439688162980615, - "name":"mood_great" - }, - { - "confidence":0.04577343822867981, - "name":"goodbye" - }, - { - "confidence":0.037760394267609965, - "name":"greet" - }, - { - "confidence":0.017715563733253295, - "name":"affirm" - }, - { - "confidence":0.013017137225390567, - "name":"deny" - } - ], - "text":"Super sad" - }, - "text":"Super sad", - "timestamp":1551883982.540276, - "use_text_for_featurization": false - }, - { - "py/object":"rasa.shared.core.events.ActionExecuted", - "action_name":"utter_cheer_up", - "action_text": null, - "confidence":1.0, - "policy":"policy_2_MemoizationPolicy", - "timestamp":1551883985.031668, - "unpredictable":false, - "hide_rule_turn": false - }, - { - "py/object":"rasa.shared.core.events.BotUttered", - "data":{ - "attachment":"https://i.imgur.com/nGF1K8f.jpg", - "buttons":null, - "elements":null - }, - "text":"Here is something to cheer you up:", - "timestamp":1551883985.0316749 - }, - { - "py/object":"rasa.shared.core.events.ActionExecuted", - "action_name":"utter_did_that_help", - "action_text": null, - "confidence":1.0, - "policy":"policy_2_MemoizationPolicy", - "timestamp":1551883985.940413, - "unpredictable":false, - "hide_rule_turn": false - }, - { - "py/object":"rasa.shared.core.events.BotUttered", - "data":{ - "attachment":null, - "buttons":null, - "elements":null - }, - "text":"Did that help you?", - "timestamp":1551883985.940421 - }, - { - "py/object":"rasa.shared.core.events.ActionExecuted", - "action_name":"action_listen", - "action_text": null, - "confidence":1.0, - "policy":"policy_2_MemoizationPolicy", - "timestamp":1551883986.958556, - "unpredictable":false, - "hide_rule_turn": false - }, - { - "py/object":"rasa.shared.core.events.UserUttered", - "entities":[ - - ], - "input_channel":null, - "intent":{ - "confidence":0.8162296627642036, - "name":"deny" - }, - "message_id":null, - "parse_data":{ - "entities":[ - - ], - "intent":{ - "confidence":0.8162296627642036, - "name":"deny" - }, - "intent_ranking":[ - { - "confidence":0.8162296627642036, - "name":"deny" - }, - { - "confidence":0.07152463661481759, - "name":"mood_unhappy" - }, - { - "confidence":0.05028159510181415, - "name":"greet" - }, - { - "confidence":0.02662414324721426, - "name":"affirm" - }, - { - "confidence":0.024343883584915963, - "name":"goodbye" - }, - { - "confidence":0.010996078687034375, - "name":"mood_great" - } - ], - "text":"No" - }, - "text":"No", - "timestamp":1551883989.0720608, - "use_text_for_featurization": false - }, - { - "py/object":"rasa.shared.core.events.ActionExecuted", - "action_name":"utter_goodbye", - "action_text": null, - "confidence":1.0, - "policy":"policy_2_MemoizationPolicy", - "timestamp":1551883991.061463, - "unpredictable":false, - "hide_rule_turn": false - }, - { - 
"py/object":"rasa.shared.core.events.BotUttered", - "data":{ - "attachment":null, - "buttons":null, - "elements":null - }, - "text":"Bye", - "timestamp":1551883991.061471 - } - ] -} \ No newline at end of file diff --git a/data/test_endpoints/event_brokers/kafka_sasl_plaintext_endpoint.yml b/data/test_endpoints/event_brokers/kafka_sasl_plaintext_endpoint.yml index 7b9cb32dd863..3c76d71cea0e 100644 --- a/data/test_endpoints/event_brokers/kafka_sasl_plaintext_endpoint.yml +++ b/data/test_endpoints/event_brokers/kafka_sasl_plaintext_endpoint.yml @@ -7,3 +7,4 @@ event_broker: sasl_username: username sasl_password: password sasl_mechanism: PLAIN + convert_intent_id_to_string: True diff --git a/docs/docs/components.mdx b/docs/docs/components.mdx index 17dec7ee71d2..4fc7b2a6d898 100644 --- a/docs/docs/components.mdx +++ b/docs/docs/components.mdx @@ -130,97 +130,6 @@ word vectors in your pipeline. attach spaCy models that you've trained yourself. -### HFTransformersNLP - -:::caution Deprecated -The `HFTransformersNLP` is deprecated and will be removed in a future release. The [LanguageModelFeaturizer](./components.mdx#languagemodelfeaturizer) -now implements its behavior. -::: - -* **Short** - - HuggingFace's Transformers based pre-trained language model initializer - - - -* **Outputs** - - Nothing - - - -* **Requires** - - Nothing - - - -* **Description** - - Initializes specified pre-trained language model from HuggingFace's [Transformers library](https://huggingface.co/transformers/). The component applies language model specific tokenization and - featurization to compute sequence and sentence level representations for each example in the training data. - Include [LanguageModelTokenizer](./components.mdx#languagemodeltokenizer) and [LanguageModelFeaturizer](./components.mdx#languagemodelfeaturizer) to utilize the output of this - component for downstream NLU models. - - :::note - To use `HFTransformersNLP` component, install Rasa Open Source with `pip3 install rasa[transformers]`. - - ::: - - - -* **Configuration** - - You should specify what language model to load via the parameter `model_name`. See the below table for the - available language models. - Additionally, you can also specify the architecture variation of the chosen language model by specifying the - parameter `model_weights`. - The full list of supported architectures can be found in the - [HuggingFace documentation](https://huggingface.co/transformers/pretrained_models.html). - If left empty, it uses the default model architecture that original Transformers library loads (see table below). 
- - ``` - +----------------+--------------+-------------------------+ - | Language Model | Parameter | Default value for | - | | "model_name" | "model_weights" | - +----------------+--------------+-------------------------+ - | BERT | bert | rasa/LaBSE | - +----------------+--------------+-------------------------+ - | GPT | gpt | openai-gpt | - +----------------+--------------+-------------------------+ - | GPT-2 | gpt2 | gpt2 | - +----------------+--------------+-------------------------+ - | XLNet | xlnet | xlnet-base-cased | - +----------------+--------------+-------------------------+ - | DistilBERT | distilbert | distilbert-base-uncased | - +----------------+--------------+-------------------------+ - | RoBERTa | roberta | roberta-base | - +----------------+--------------+-------------------------+ - ``` - - The following configuration loads the language model BERT: - - ```yaml-rasa - pipeline: - - name: HFTransformersNLP - # Name of the language model to use - model_name: "bert" - # Pre-Trained weights to be loaded - model_weights: "rasa/LaBSE" - - # An optional path to a directory from which - # to load pre-trained model weights. - # If the requested model is not found in the - # directory, it will be downloaded and - # cached in this directory for future use. - # The default value of `cache_dir` can be - # set using the environment variable - # `TRANSFORMERS_CACHE`, as per the - # Transformers library. - cache_dir: null - ``` - - ## Tokenizers Tokenizers split text into tokens. @@ -428,108 +337,6 @@ now implements its behavior. ``` - ### ConveRTTokenizer - -:::caution Deprecated -The `ConveRTTokenizer` is deprecated and will be removed in a future release. The [ConveRTFeaturizer](./components.mdx#convertfeaturizer) -should now be used with any other [tokenizer](./components.mdx#tokenizers), for example [WhitespaceTokenizer](./components.mdx#whitespacetokenizer). -::: - - * **Short** - - Tokenizer using [ConveRT](https://github.com/PolyAI-LDN/polyai-models#convert) model. - - - - * **Outputs** - - `tokens` for user messages, responses (if present), and intents (if specified) - - - - * **Requires** - - Nothing - - - - * **Description** - - Creates tokens using the ConveRT tokenizer. - - :::note - Since `ConveRT` model is trained only on an English corpus of conversations, this tokenizer should only - be used if your training data is in English language. - - ::: - - :::note - To use `ConveRTTokenizer`, install Rasa Open Source with `pip3 install rasa[convert]`. - - ::: - - - - * **Configuration** - - ```yaml-rasa - pipeline: - - name: "ConveRTTokenizer" - # Flag to check whether to split intents - "intent_tokenization_flag": False - # Symbol on which intent should be split - "intent_split_symbol": "_" - # Regular expression to detect tokens - "token_pattern": None - # Remote URL/Local directory of model files(Required) - "model_url": None - ``` - - - - ### LanguageModelTokenizer - -:::caution Deprecated -The `LanguageModelTokenizer` is deprecated and will be removed in a future release. The [LanguageModelFeaturizer](./components.mdx#languagemodelfeaturizer) -should now be used with any other [tokenizer](./components.mdx#tokenizers), for example [WhitespaceTokenizer](./components.mdx#whitespacetokenizer). 
-::: - -* **Short** - -Tokenizer from pre-trained language models - - - -* **Outputs** - -`tokens` for user messages, responses (if present), and intents (if specified) - - - -* **Requires** - -[HFTransformersNLP](./components.mdx#hftransformersnlp) - - - -* **Description** - -Creates tokens using the pre-trained language model specified in upstream [HFTransformersNLP](./components.mdx#hftransformersnlp) component. - - - -* **Configuration** - -```yaml-rasa -pipeline: -- name: "LanguageModelTokenizer" - # Flag to check whether to split intents - "intent_tokenization_flag": False - # Symbol on which intent should be split - "intent_split_symbol": "_" -``` - - ## Featurizers Text featurizers are divided into two different categories: sparse featurizers and dense featurizers. diff --git a/docs/docs/tuning-your-model.mdx b/docs/docs/tuning-your-model.mdx index 8d1399e177cd..b0d711715807 100644 --- a/docs/docs/tuning-your-model.mdx +++ b/docs/docs/tuning-your-model.mdx @@ -230,7 +230,7 @@ for both is highly likely to be the same. This is also useful if you don't have An alternative to [ConveRTFeaturizer](./components.mdx#convertfeaturizer) is the [LanguageModelFeaturizer](./components.mdx#languagemodelfeaturizer) which uses pre-trained language models such as BERT, GPT-2, etc. to extract similar contextual vector representations for the complete sentence. See -[HFTransformersNLP](./components.mdx#hftransformersnlp) for a full list of supported language models. +[LanguageModelFeaturizer](./components.mdx#languagemodelfeaturizer) for a full list of supported language models. If your training data is not in English you can also use a different variant of a language model which is pre-trained in the language specific to your training data. diff --git a/rasa/cli/x.py b/rasa/cli/x.py index 60fdb139ecf2..a284b91142d4 100644 --- a/rasa/cli/x.py +++ b/rasa/cli/x.py @@ -493,28 +493,14 @@ def run_locally(args: argparse.Namespace) -> None: # noinspection PyBroadException try: - try: - local.main( - args, - project_path, - args.data, - token=rasa_x_token, - config_path=config_path, - domain_path=domain_path, - ) - except TypeError as e: - if "domain_path" in str(e): - # backwards compatibility with Rasa X versions < 0.35.0 - # fixes regression https://github.com/RasaHQ/rasa/issues/7592 - local.main( - args, - project_path, - args.data, - token=rasa_x_token, - config_path=config_path, - ) - else: - raise + local.main( + args, + project_path, + args.data, + token=rasa_x_token, + config_path=config_path, + domain_path=domain_path, + ) except RasaXTermsError: # User didn't accept the Rasa X terms. 
        pass
diff --git a/rasa/core/actions/forms.py b/rasa/core/actions/forms.py
index b1091fb507ba..c8cb1ffc7951 100644
--- a/rasa/core/actions/forms.py
+++ b/rasa/core/actions/forms.py
@@ -48,9 +48,11 @@ def __init__(
         self.action_endpoint = action_endpoint
         # creating it requires domain, which we don't have in init
         # we'll create it on the first call
-        self._unique_entity_mappings = None
+        self._unique_entity_mappings: Set[Text] = set()
+        self._have_unique_entity_mappings_been_initialized = False
 
     def name(self) -> Text:
+        """Return the form name."""
         return self._form_name
 
     def required_slots(self, domain: Domain) -> List[Text]:
@@ -160,9 +162,10 @@ def _create_unique_entity_mappings(self, domain: Domain) -> Set[Text]:
     def _entity_mapping_is_unique(
         self, slot_mapping: Dict[Text, Any], domain: Domain
     ) -> bool:
-        if self._unique_entity_mappings is None:
+        if not self._have_unique_entity_mappings_been_initialized:
             # create unique entity mappings on the first call
             self._unique_entity_mappings = self._create_unique_entity_mappings(domain)
+            self._have_unique_entity_mappings_been_initialized = True
 
         mapping_as_string = json.dumps(slot_mapping, sort_keys=True)
         return mapping_as_string in self._unique_entity_mappings
@@ -562,7 +565,7 @@ async def request_next_slot(
         events_so_far: List[Event],
     ) -> List[Union[SlotSet, Event]]:
         """Request the next slot and response if needed, else return `None`."""
-        request_slot_events: List[Union[SlotSet, Event]] = []
+        request_slot_events: List[Event] = []
 
         if await self.is_done(output_channel, nlg, tracker, domain, events_so_far):
             # The custom action for slot validation decided to stop the form early
diff --git a/rasa/core/brokers/kafka.py b/rasa/core/brokers/kafka.py
index 4233bbf916b0..2d8801bb5e67 100644
--- a/rasa/core/brokers/kafka.py
+++ b/rasa/core/brokers/kafka.py
@@ -34,6 +34,7 @@ def __init__(
         ssl_check_hostname: bool = False,
         security_protocol: Text = "SASL_PLAINTEXT",
         loglevel: Union[int, Text] = logging.ERROR,
+        convert_intent_id_to_string: bool = False,
         **kwargs: Any,
     ) -> None:
         """Kafka event broker.
@@ -68,7 +69,8 @@ def __init__(
             security_protocol: Protocol used to communicate with brokers.
                 Valid values are: PLAINTEXT, SSL, SASL_PLAINTEXT, SASL_SSL.
             loglevel: Logging level of the kafka logger.
-
+            convert_intent_id_to_string: Optional flag to configure whether intent IDs
+                are converted from an integer to a string.
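The new flag can also be set programmatically. A minimal sketch, not part of the patch: the `url` and `topic` parameters shown are assumed to be part of the constructor's unchanged signature (they sit outside this hunk), and endpoint files reach the same code path via the `convert_intent_id_to_string: True` line added to `kafka_sasl_plaintext_endpoint.yml` above.

```python
# A minimal sketch under stated assumptions: `url` and `topic` are assumed
# constructor parameters outside this hunk. PLAINTEXT is used here so the
# example does not need the SASL credentials the default SASL_PLAINTEXT
# protocol would require.
from rasa.core.brokers.kafka import KafkaEventBroker

broker = KafkaEventBroker(
    url="localhost:9092",
    topic="rasa_core_events",
    security_protocol="PLAINTEXT",
    convert_intent_id_to_string=True,  # new flag in this PR; defaults to False
)
```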
""" import kafka @@ -85,6 +87,7 @@ def __init__( self.ssl_certfile = ssl_certfile self.ssl_keyfile = ssl_keyfile self.ssl_check_hostname = ssl_check_hostname + self.convert_intent_id_to_string = convert_intent_id_to_string logging.getLogger("kafka").setLevel(loglevel) @@ -107,6 +110,8 @@ def publish( retry_delay_in_seconds: float = 5, ) -> None: """Publishes events.""" + if self.convert_intent_id_to_string: + event = self._convert_intent_id_to_string(event) if self.producer is None: self._create_producer() connected = self.producer.bootstrap_connected() @@ -200,5 +205,17 @@ def _publish(self, event: Dict[Text, Any]) -> None: ) self.producer.send(self.topic, value=event, key=partition_key) + def _convert_intent_id_to_string(self, event: Dict[Text, Any]) -> Dict[Text, Any]: + if event.get("event", "") == "user" and "id" in event.get("parse_data", {}).get( + "intent", {} + ): + event["parse_data"]["intent"]["id"] = str( + event["parse_data"]["intent"]["id"] + ) + for idx, parse_data in enumerate(event["parse_data"]["intent_ranking"]): + parse_data["id"] = str(parse_data["id"]) + event["parse_data"]["intent_ranking"][idx] = parse_data + return event + def _close(self) -> None: self.producer.close() diff --git a/rasa/core/channels/console.py b/rasa/core/channels/console.py index 86a89630dbf8..f1f00576b2e2 100644 --- a/rasa/core/channels/console.py +++ b/rasa/core/channels/console.py @@ -61,26 +61,26 @@ def _print_bot_output( return question if "text" in message: - rasa.shared.utils.cli.print_color(message.get("text"), color=color) + rasa.shared.utils.cli.print_color(message["text"], color=color) if "image" in message: - rasa.shared.utils.cli.print_color("Image: " + message.get("image"), color=color) + rasa.shared.utils.cli.print_color("Image: " + message["image"], color=color) if "attachment" in message: rasa.shared.utils.cli.print_color( - "Attachment: " + message.get("attachment"), color=color + "Attachment: " + message["attachment"], color=color ) if "elements" in message: rasa.shared.utils.cli.print_color("Elements:", color=color) - for idx, element in enumerate(message.get("elements")): + for idx, element in enumerate(message["elements"]): rasa.shared.utils.cli.print_color( cli_utils.element_to_string(element, idx), color=color ) if "quick_replies" in message: rasa.shared.utils.cli.print_color("Quick Replies:", color=color) - for idx, element in enumerate(message.get("quick_replies")): + for idx, element in enumerate(message["quick_replies"]): rasa.shared.utils.cli.print_color( cli_utils.button_to_string(element, idx), color=color ) @@ -88,7 +88,7 @@ def _print_bot_output( if "custom" in message: rasa.shared.utils.cli.print_color("Custom json:", color=color) rasa.shared.utils.cli.print_color( - json.dumps(message.get("custom"), indent=2), color=color + json.dumps(message["custom"], indent=2), color=color ) return None diff --git a/rasa/core/featurizers/single_state_featurizer.py b/rasa/core/featurizers/single_state_featurizer.py index 1d1f50279248..06f54ce7ba27 100644 --- a/rasa/core/featurizers/single_state_featurizer.py +++ b/rasa/core/featurizers/single_state_featurizer.py @@ -4,14 +4,12 @@ from typing import List, Optional, Dict, Text, Set, Any from collections import defaultdict -import rasa.shared.utils.io from rasa.nlu.extractors.extractor import EntityTagSpec from rasa.nlu.utils import bilou_utils from rasa.nlu.utils.bilou_utils import BILOU_PREFIXES from rasa.shared.core.domain import SubState, State, Domain from rasa.shared.nlu.interpreter import NaturalLanguageInterpreter, 
RegexInterpreter from rasa.shared.core.constants import PREVIOUS_ACTION, ACTIVE_LOOP, USER, SLOTS -from rasa.shared.constants import DOCS_URL_MIGRATION_GUIDE from rasa.shared.core.trackers import is_prev_action_listen_in_state from rasa.shared.nlu.constants import ( ENTITIES, @@ -371,30 +369,6 @@ def encode_all_labels( for action in domain.action_names_or_texts ] - def encode_all_actions( - self, domain: Domain, interpreter: NaturalLanguageInterpreter - ) -> List[Dict[Text, List[Features]]]: - """Encodes all actions from the domain using the given interpreter. - - This method is deprecated and will be removed in Rasa Open Source 3.0.0 . - It is recommended to use `encode_all_labels` instead. - - Args: - domain: The domain that contains the actions. - interpreter: The interpreter used to encode the actions. - - Returns: - A list of encoded actions. - """ - rasa.shared.utils.io.raise_deprecation_warning( - f"'{self.__class__.__name__}.encode_all_actions' is deprecated and " - f"will be removed in Rasa Open Source 3.0.0. " - f"It is recommended to use the method 'encode_all_labels' instead.", - docs=DOCS_URL_MIGRATION_GUIDE, - ) - - return self.encode_all_labels(domain, interpreter) - class IntentTokenizerSingleStateFeaturizer(SingleStateFeaturizer): """A SingleStateFeaturizer for use with policies that predict intent labels.""" @@ -428,45 +402,3 @@ def encode_all_labels( A list of encoded labels. """ return [self._encode_intent(intent, interpreter) for intent in domain.intents] - - -class BinarySingleStateFeaturizer(SingleStateFeaturizer): - """Dialogue State featurizer which features the state as binaries.""" - - def __init__(self) -> None: - """Creates featurizer.""" - super().__init__() - rasa.shared.utils.io.raise_deprecation_warning( - f"'{self.__class__.__name__}' is deprecated and " - f"will be removed in Rasa Open Source 3.0.0. " - f"It is recommended to use the '{SingleStateFeaturizer.__name__}' instead.", - docs=DOCS_URL_MIGRATION_GUIDE, - ) - - def _extract_state_features( - self, - sub_state: SubState, - interpreter: NaturalLanguageInterpreter, - sparse: bool = False, - ) -> Dict[Text, List[Features]]: - # create a special method that doesn't use passed interpreter - name_attribute = self._get_name_attribute(set(sub_state.keys())) - if name_attribute: - return { - name_attribute: self._create_features(sub_state, name_attribute, sparse) - } - - return {} - - -class LabelTokenizerSingleStateFeaturizer(SingleStateFeaturizer): - def __init__(self, *args: Any, **kwargs: Any) -> None: - super().__init__() - # it is hard to fully mimic old behavior, but SingleStateFeaturizer - # does the same thing if nlu pipeline is configured correctly - rasa.shared.utils.io.raise_deprecation_warning( - f"'{self.__class__.__name__}' is deprecated and " - f"will be removed in Rasa Open Source 3.0.0. " - f"It is recommended to use the '{SingleStateFeaturizer.__name__}' instead.", - docs=DOCS_URL_MIGRATION_GUIDE, - ) diff --git a/rasa/core/interpreter.py b/rasa/core/interpreter.py index 851fcd9c1de3..fdd764b7e5ff 100644 --- a/rasa/core/interpreter.py +++ b/rasa/core/interpreter.py @@ -72,9 +72,10 @@ async def _rasa_http_parse( self, text: Text, message_id: Optional[Text] = None ) -> Optional[Dict[Text, Any]]: """Send a text message to a running rasa NLU http server. - Return `None` on failure.""" - if not self.endpoint_config: + Return `None` on failure. + """ + if not self.endpoint_config or self.endpoint_config.url is None: logger.error( f"Failed to parse text '{text}' using rasa NLU over http. 
" f"No rasa NLU server specified!" diff --git a/rasa/core/policies/policy.py b/rasa/core/policies/policy.py index 0b3382955969..bcd5af6bb864 100644 --- a/rasa/core/policies/policy.py +++ b/rasa/core/policies/policy.py @@ -343,7 +343,7 @@ def _metadata(self) -> Optional[Dict[Text, Any]]: pass @classmethod - def _metadata_filename(cls) -> Optional[Text]: + def _metadata_filename(cls) -> Text: """Returns the filename of the persisted policy metadata. Policies using the default `persist()` and `load()` implementations must diff --git a/rasa/core/policies/rule_policy.py b/rasa/core/policies/rule_policy.py index b0f63a4df8a3..b091f7389985 100644 --- a/rasa/core/policies/rule_policy.py +++ b/rasa/core/policies/rule_policy.py @@ -907,20 +907,21 @@ def _find_action_from_default_actions( ): return None, None - default_action_name = DEFAULT_ACTION_MAPPINGS.get( - tracker.latest_message.intent.get(INTENT_NAME_KEY) - ) + intent_name = tracker.latest_message.intent.get(INTENT_NAME_KEY) + if intent_name is None: + return None, None - if default_action_name: - logger.debug(f"Predicted default action '{default_action_name}'.") - return ( - default_action_name, - # create prediction source that corresponds to one of - # default prediction sources in `_default_sources()` - DEFAULT_RULES + tracker.latest_message.intent.get(INTENT_NAME_KEY), - ) + default_action_name = DEFAULT_ACTION_MAPPINGS.get(intent_name) + if default_action_name is None: + return None, None - return None, None + logger.debug(f"Predicted default action '{default_action_name}'.") + return ( + default_action_name, + # create prediction source that corresponds to one of + # default prediction sources in `_default_sources()` + DEFAULT_RULES + intent_name, + ) @staticmethod def _find_action_from_loop_happy_path( @@ -928,16 +929,16 @@ def _find_action_from_loop_happy_path( ) -> Tuple[Optional[Text], Optional[Text]]: active_loop_name = tracker.active_loop_name + if active_loop_name is None: + return None, None + active_loop_rejected = tracker.active_loop.get(LOOP_REJECTED) should_predict_loop = ( - active_loop_name - and not active_loop_rejected + not active_loop_rejected and tracker.latest_action.get(ACTION_NAME) != active_loop_name ) should_predict_listen = ( - active_loop_name - and not active_loop_rejected - and tracker.latest_action_name == active_loop_name + not active_loop_rejected and tracker.latest_action_name == active_loop_name ) if should_predict_loop: diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py index e596e02c11d6..02c264ce2265 100644 --- a/rasa/core/policies/ted_policy.py +++ b/rasa/core/policies/ted_policy.py @@ -198,8 +198,9 @@ class TEDPolicy(Policy): KEY_RELATIVE_ATTENTION: False, # If 'True' use value relative embeddings in attention VALUE_RELATIVE_ATTENTION: False, - # Max position for relative embeddings - MAX_RELATIVE_POSITION: None, + # Max position for relative embeddings. Only in effect if key- or value relative + # attention are turned on + MAX_RELATIVE_POSITION: 5, # Use a unidirectional or bidirectional encoder # for `text`, `action_text`, and `label_action_text`. 
UNIDIRECTIONAL_ENCODER: False, diff --git a/rasa/core/policies/unexpected_intent_policy.py b/rasa/core/policies/unexpected_intent_policy.py index c0b2a1dd1bf5..a0976a4dd887 100644 --- a/rasa/core/policies/unexpected_intent_policy.py +++ b/rasa/core/policies/unexpected_intent_policy.py @@ -165,8 +165,9 @@ class UnexpecTEDIntentPolicy(TEDPolicy): KEY_RELATIVE_ATTENTION: False, # If 'True' use value relative embeddings in attention VALUE_RELATIVE_ATTENTION: False, - # Max position for relative embeddings - MAX_RELATIVE_POSITION: None, + # Max position for relative embeddings. Only in effect if key- or value relative + # attention are turned on + MAX_RELATIVE_POSITION: 5, # Use a unidirectional or bidirectional encoder # for `text`, `action_text`, and `label_action_text`. UNIDIRECTIONAL_ENCODER: False, diff --git a/rasa/core/processor.py b/rasa/core/processor.py index 756cdfa3972f..9db3f3d74caf 100644 --- a/rasa/core/processor.py +++ b/rasa/core/processor.py @@ -15,6 +15,7 @@ ) import rasa.core.utils from rasa.core.policies.policy import PolicyPrediction +from rasa.exceptions import ActionLimitReached from rasa.shared.core.constants import ( USER_INTENT_RESTART, ACTION_LISTEN_NAME, @@ -33,6 +34,7 @@ ReminderScheduled, SlotSet, UserUttered, + ActionExecuted, ) from rasa.shared.core.slots import Slot from rasa.shared.core.training_data.story_reader.yaml_story_reader import ( @@ -374,8 +376,23 @@ def predict_next_action( """Predicts the next action the bot should take after seeing x. This should be overwritten by more advanced policies to use - ML to predict the action. Returns the index of the next action. + ML to predict the action. + + Returns: + The index of the next action and prediction of the policy. + + Raises: + ActionLimitReached if the limit of actions to predict has been reached. """ + should_predict_another_action = self.should_predict_another_action( + tracker.latest_action_name + ) + + if self.is_action_limit_reached(tracker, should_predict_another_action): + raise ActionLimitReached( + "The limit of actions to predict has been reached." + ) + prediction = self._get_next_action_probabilities(tracker) action = rasa.core.actions.action.action_for_index( @@ -623,18 +640,27 @@ def _should_handle_message(tracker: DialogueStateTracker) -> bool: ) def is_action_limit_reached( - self, num_predicted_actions: int, should_predict_another_action: bool + self, tracker: DialogueStateTracker, should_predict_another_action: bool, ) -> bool: """Check whether the maximum number of predictions has been met. Args: - num_predicted_actions: Number of predicted actions. + tracker: instance of DialogueStateTracker. should_predict_another_action: Whether the last executed action allows for more actions to be predicted or not. Returns: `True` if the limit of actions to predict has been reached. """ + reversed_events = list(tracker.events)[::-1] + num_predicted_actions = 0 + + for e in reversed_events: + if isinstance(e, ActionExecuted): + if e.action_name in (ACTION_LISTEN_NAME, ACTION_SESSION_START_NAME): + break + num_predicted_actions += 1 + return ( num_predicted_actions >= self.max_number_of_predictions and should_predict_another_action @@ -645,33 +671,25 @@ async def _predict_and_execute_next_action( ) -> None: # keep taking actions decided by the policy until it chooses to 'listen' should_predict_another_action = True - num_predicted_actions = 0 # action loop. 
predicts actions until we hit action listen - while ( - should_predict_another_action - and self._should_handle_message(tracker) - and num_predicted_actions < self.max_number_of_predictions - ): + while should_predict_another_action and self._should_handle_message(tracker): # this actually just calls the policy's method by the same name - action, prediction = self.predict_next_action(tracker) + try: + action, prediction = self.predict_next_action(tracker) + except ActionLimitReached: + logger.warning( + "Circuit breaker tripped. Stopped predicting " + f"more actions for sender '{tracker.sender_id}'." + ) + if self.on_circuit_break: + # call a registered callback + self.on_circuit_break(tracker, output_channel, self.nlg) + break should_predict_another_action = await self._run_action( action, tracker, output_channel, self.nlg, prediction ) - num_predicted_actions += 1 - - if self.is_action_limit_reached( - num_predicted_actions, should_predict_another_action - ): - # circuit breaker was tripped - logger.warning( - "Circuit breaker tripped. Stopped predicting " - f"more actions for sender '{tracker.sender_id}'." - ) - if self.on_circuit_break: - # call a registered callback - self.on_circuit_break(tracker, output_channel, self.nlg) @staticmethod def should_predict_another_action(action_name: Text) -> bool: diff --git a/rasa/core/test.py b/rasa/core/test.py index 42e7ae094369..6a5d445f5dbd 100644 --- a/rasa/core/test.py +++ b/rasa/core/test.py @@ -56,6 +56,7 @@ from rasa.shared.importers.importer import TrainingDataImporter from rasa.shared.utils.io import DEFAULT_ENCODING from rasa.utils.tensorflow.constants import QUERY_INTENT_KEY, SEVERITY_KEY +from rasa.exceptions import ActionLimitReached if typing.TYPE_CHECKING: from rasa.core.agent import Agent @@ -271,8 +272,8 @@ def serialise(self) -> Tuple[PredictionList, PredictionList]: ) ) - aligned_entity_targets = [] - aligned_entity_predictions = [] + aligned_entity_targets: List[Optional[Text]] = [] + aligned_entity_predictions: List[Optional[Text]] = [] for text in texts: # sort the entities of this sentence to compare them directly @@ -637,7 +638,6 @@ def _collect_action_executed_predictions( partial_tracker: DialogueStateTracker, event: ActionExecuted, fail_on_prediction_errors: bool, - circuit_breaker_tripped: bool, ) -> Tuple[EvaluationStore, PolicyPrediction, Optional[EntityEvaluationResult]]: action_executed_eval_store = EvaluationStore() @@ -649,13 +649,13 @@ def _collect_action_executed_predictions( policy_entity_result = None prev_action_unlikely_intent = False - if circuit_breaker_tripped: - prediction = PolicyPrediction([], policy_name=None) - predicted_action = "circuit breaker tripped" - else: + try: predicted_action, prediction, policy_entity_result = _run_action_prediction( processor, partial_tracker, expected_action ) + except ActionLimitReached: + prediction = PolicyPrediction([], policy_name=None) + predicted_action = "circuit breaker tripped" predicted_action_unlikely_intent = predicted_action == ACTION_UNLIKELY_INTENT_NAME if predicted_action_unlikely_intent and predicted_action != expected_action: @@ -671,9 +671,14 @@ def _collect_action_executed_predictions( ) ) prev_action_unlikely_intent = True - predicted_action, prediction, policy_entity_result = _run_action_prediction( - processor, partial_tracker, expected_action - ) + + try: + predicted_action, prediction, policy_entity_result = _run_action_prediction( + processor, partial_tracker, expected_action + ) + except ActionLimitReached: + prediction = 
PolicyPrediction([], policy_name=None) + predicted_action = "circuit breaker tripped" action_executed_eval_store.add_to_store( action_predictions=[predicted_action], action_targets=[expected_action] @@ -761,25 +766,16 @@ async def _predict_tracker_actions( ) tracker_actions = [] - should_predict_another_action = True - num_predicted_actions = 0 policy_entity_results = [] for event in events[1:]: if isinstance(event, ActionExecuted): - circuit_breaker_tripped = processor.is_action_limit_reached( - num_predicted_actions, should_predict_another_action - ) ( action_executed_result, prediction, entity_result, ) = _collect_action_executed_predictions( - processor, - partial_tracker, - event, - fail_on_prediction_errors, - circuit_breaker_tripped, + processor, partial_tracker, event, fail_on_prediction_errors, ) if entity_result: @@ -795,10 +791,6 @@ async def _predict_tracker_actions( "confidence": prediction.max_confidence, } ) - should_predict_another_action = processor.should_predict_another_action( - action_executed_result.action_predictions[0] - ) - num_predicted_actions += 1 elif use_e2e and isinstance(event, UserUttered): # This means that user utterance didn't have a user message, only intent, @@ -818,8 +810,6 @@ async def _predict_tracker_actions( tracker_eval_store.merge_store(user_uttered_result) else: partial_tracker.update(event) - if isinstance(event, UserUttered): - num_predicted_actions = 0 return tracker_eval_store, partial_tracker, tracker_actions, policy_entity_results diff --git a/rasa/core/training/interactive.py b/rasa/core/training/interactive.py index a55a3fecf20e..6adfbe7a9ccd 100644 --- a/rasa/core/training/interactive.py +++ b/rasa/core/training/interactive.py @@ -270,13 +270,13 @@ def format_bot_output(message: BotUttered) -> Text: if not data: return output - if data.get("image"): - output += "\nImage: " + data.get("image") + if "image" in data and data["image"] is not None: + output += "\nImage: " + data["image"] - if data.get("attachment"): - output += "\nAttachment: " + data.get("attachment") + if "attachment" in data and data["attachment"] is not None: + output += "\nAttachment: " + data["attachment"] - if data.get("buttons"): + if "buttons" in data and data["buttons"] is not None: output += "\nButtons:" choices = rasa.cli.utils.button_choices_from_message_data( data, allow_free_text_input=True @@ -284,15 +284,15 @@ def format_bot_output(message: BotUttered) -> Text: for choice in choices: output += "\n" + choice - if data.get("elements"): + if "elements" in data and data["elements"] is not None: output += "\nElements:" - for idx, element in enumerate(data.get("elements")): + for idx, element in enumerate(data["elements"]): element_str = rasa.cli.utils.element_to_string(element, idx) output += "\n" + element_str - if data.get("quick_replies"): + if "quick_replies" in data and data["quick_replies"] is not None: output += "\nQuick replies:" - for idx, element in enumerate(data.get("quick_replies")): + for idx, element in enumerate(data["quick_replies"]): element_str = rasa.cli.utils.element_to_string(element, idx) output += "\n" + element_str return output @@ -667,10 +667,7 @@ async def _request_action_from_user( await _print_history(conversation_id, endpoint) choices = [ - { - "name": f'{a.get("score"):03.2f} {a.get("action"):40}', - "value": a.get("action"), - } + {"name": f'{a["score"]:03.2f} {a["action"]:40}', "value": a["action"],} for a in predictions ] diff --git a/rasa/core/utils.py b/rasa/core/utils.py index 3b94f849a20c..95a940b4dcca 100644 --- 
a/rasa/core/utils.py +++ b/rasa/core/utils.py @@ -19,8 +19,6 @@ from rasa.constants import DEFAULT_SANIC_WORKERS, ENV_SANIC_WORKERS from rasa.shared.constants import DEFAULT_ENDPOINTS_PATH -# backwards compatibility 1.0.x -# noinspection PyUnresolvedReferences from rasa.core.lock_store import LockStore, RedisLockStore, InMemoryLockStore from rasa.utils.endpoints import EndpointConfig, read_endpoint_config from sanic import Sanic diff --git a/rasa/exceptions.py b/rasa/exceptions.py index 93794bbc8284..00692c0debac 100644 --- a/rasa/exceptions.py +++ b/rasa/exceptions.py @@ -37,3 +37,7 @@ def __init__(self, timestamp: float) -> None: def __str__(self) -> Text: """Returns string representation of exception.""" return str(self.timestamp) + + +class ActionLimitReached(RasaException): + """Raised when predicted action limit is reached.""" diff --git a/rasa/model.py b/rasa/model.py index 456dca7da184..393ec37cca75 100644 --- a/rasa/model.py +++ b/rasa/model.py @@ -386,7 +386,8 @@ def _get_fingerprint_of_config( if not config: return "" - keys = include_keys or list(filter(lambda k: k not in exclude_keys, config.keys())) + exclude_keys = exclude_keys or [] + keys = include_keys or [k for k in config.keys() if k not in exclude_keys] sub_config = {k: config[k] for k in keys if k in config} diff --git a/rasa/model_testing.py b/rasa/model_testing.py index 584c03363181..ee60ed25cfc8 100644 --- a/rasa/model_testing.py +++ b/rasa/model_testing.py @@ -127,10 +127,6 @@ def test_core_models( ) -# backwards compatibility -test = rasa.test - - def test_core( model: Optional[Text] = None, stories: Optional[Text] = None, diff --git a/rasa/model_training.py b/rasa/model_training.py index 3acabed1089f..2c9930f2150f 100644 --- a/rasa/model_training.py +++ b/rasa/model_training.py @@ -51,10 +51,6 @@ class TrainingResult(NamedTuple): code: int = 0 -# backwards compatibility -train = rasa.train - - async def train_async( domain: Union[Domain, Text], config: Text, diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index ffbf38bffbcc..810ecb7ab8b2 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -8,7 +8,7 @@ import scipy.sparse import tensorflow as tf -from typing import Any, Dict, List, Optional, Text, Tuple, TypeVar, Union, Type +from typing import Any, Dict, List, Optional, Text, Tuple, Union, Type import rasa.shared.utils.io import rasa.utils.io as io_utils @@ -147,8 +147,9 @@ def required_components(cls) -> List[Type[Component]]: KEY_RELATIVE_ATTENTION: False, # If 'True' use value relative embeddings in attention VALUE_RELATIVE_ATTENTION: False, - # Max position for relative embeddings - MAX_RELATIVE_POSITION: None, + # Max position for relative embeddings. Only in effect if key- or value relative + # attention are turned on + MAX_RELATIVE_POSITION: 5, # Use a unidirectional or bidirectional encoder. 
UNIDIRECTIONAL_ENCODER: False, # ## Training parameters @@ -1011,27 +1012,27 @@ def persist(self, file_name: Text, model_dir: Text) -> Dict[Text, Any]: if self.model is None: return {"file": None} - model_dir = Path(model_dir) - tf_model_file = model_dir / f"{file_name}.tf_model" + model_dir_path = Path(model_dir) + tf_model_file = model_dir_path / f"{file_name}.tf_model" rasa.shared.utils.io.create_directory_for_file(tf_model_file) if self.component_config[CHECKPOINT_MODEL]: - shutil.move(self.tmp_checkpoint_dir, model_dir / "checkpoints") + shutil.move(self.tmp_checkpoint_dir, model_dir_path / "checkpoints") self.model.save(str(tf_model_file)) io_utils.pickle_dump( - model_dir / f"{file_name}.data_example.pkl", self._data_example + model_dir_path / f"{file_name}.data_example.pkl", self._data_example ) io_utils.pickle_dump( - model_dir / f"{file_name}.sparse_feature_sizes.pkl", + model_dir_path / f"{file_name}.sparse_feature_sizes.pkl", self._sparse_feature_sizes, ) io_utils.pickle_dump( - model_dir / f"{file_name}.label_data.pkl", dict(self._label_data.data) + model_dir_path / f"{file_name}.label_data.pkl", dict(self._label_data.data) ) io_utils.json_pickle( - model_dir / f"{file_name}.index_label_id_mapping.json", + model_dir_path / f"{file_name}.index_label_id_mapping.json", self.index_label_id_mapping, ) @@ -1041,23 +1042,21 @@ def persist(self, file_name: Text, model_dir: Text) -> Dict[Text, Any]: else [] ) rasa.shared.utils.io.dump_obj_as_json_to_file( - model_dir / f"{file_name}.entity_tag_specs.json", entity_tag_specs + model_dir_path / f"{file_name}.entity_tag_specs.json", entity_tag_specs ) return {"file": file_name} - T = TypeVar("T") - @classmethod def load( - cls: T, + cls, meta: Dict[Text, Any], model_dir: Text, model_metadata: Metadata = None, cached_component: Optional["DIETClassifier"] = None, should_finetune: bool = False, **kwargs: Any, - ) -> T: + ) -> "DIETClassifier": """Loads the trained model from the provided directory.""" if not meta.get("file"): logger.debug( @@ -1110,21 +1109,25 @@ def _load_from_files( Dict[Text, Dict[Text, List[FeatureArray]]], Dict[Text, Dict[Text, List[int]]], ]: - file_name = meta.get("file") + file_name = meta["file"] - model_dir = Path(model_dir) + model_dir_path = Path(model_dir) - data_example = io_utils.pickle_load(model_dir / f"{file_name}.data_example.pkl") - label_data = io_utils.pickle_load(model_dir / f"{file_name}.label_data.pkl") + data_example = io_utils.pickle_load( + model_dir_path / f"{file_name}.data_example.pkl" + ) + label_data = io_utils.pickle_load( + model_dir_path / f"{file_name}.label_data.pkl" + ) label_data = RasaModelData(data=label_data) sparse_feature_sizes = io_utils.pickle_load( - model_dir / f"{file_name}.sparse_feature_sizes.pkl" + model_dir_path / f"{file_name}.sparse_feature_sizes.pkl" ) index_label_id_mapping = io_utils.json_unpickle( - model_dir / f"{file_name}.index_label_id_mapping.json" + model_dir_path / f"{file_name}.index_label_id_mapping.json" ) entity_tag_specs = rasa.shared.utils.io.read_json_file( - model_dir / f"{file_name}.entity_tag_specs.json" + model_dir_path / f"{file_name}.entity_tag_specs.json" ) entity_tag_specs = [ EntityTagSpec( @@ -1164,7 +1167,7 @@ def _load_model( model_dir: Text, finetune_mode: bool = False, ) -> "RasaModel": - file_name = meta.get("file") + file_name = meta["file"] tf_model_file = os.path.join(model_dir, file_name + ".tf_model") label_key = LABEL_KEY if meta[INTENT_CLASSIFICATION] else None diff --git a/rasa/nlu/classifiers/keyword_intent_classifier.py 
b/rasa/nlu/classifiers/keyword_intent_classifier.py
index 06bfbd107067..2813417afec9 100644
--- a/rasa/nlu/classifiers/keyword_intent_classifier.py
+++ b/rasa/nlu/classifiers/keyword_intent_classifier.py
@@ -145,20 +145,14 @@ def load(
         **kwargs: Any,
     ) -> "KeywordIntentClassifier":
         """Loads trained component (see parent class for full docstring)."""
-        if meta.get("file"):
-            file_name = meta.get("file")
-            keyword_file = os.path.join(model_dir, file_name)
-            if os.path.exists(keyword_file):
-                intent_keyword_map = rasa.shared.utils.io.read_json_file(keyword_file)
-            else:
-                rasa.shared.utils.io.raise_warning(
-                    f"Failed to load key word file for `IntentKeywordClassifier`, "
-                    f"maybe {keyword_file} does not exist?"
-                )
-                intent_keyword_map = None
-            return cls(meta, intent_keyword_map)
+        file_name = meta["file"]
+        keyword_file = os.path.join(model_dir, file_name)
+        if os.path.exists(keyword_file):
+            intent_keyword_map = rasa.shared.utils.io.read_json_file(keyword_file)
         else:
-            raise Exception(
-                f"Failed to load keyword intent classifier model. "
-                f"Path {os.path.abspath(meta.get('file'))} doesn't exist."
+            rasa.shared.utils.io.raise_warning(
+                f"Failed to load keyword file for `KeywordIntentClassifier`, "
+                f"maybe {keyword_file} does not exist?"
             )
+            intent_keyword_map = None
+        return cls(meta, intent_keyword_map)
diff --git a/rasa/nlu/classifiers/sklearn_intent_classifier.py b/rasa/nlu/classifiers/sklearn_intent_classifier.py
index 5a76bb404883..61f496c353a6 100644
--- a/rasa/nlu/classifiers/sklearn_intent_classifier.py
+++ b/rasa/nlu/classifiers/sklearn_intent_classifier.py
@@ -251,8 +251,8 @@ def load(
         """Loads trained component (see parent class for full docstring)."""
         from sklearn.preprocessing import LabelEncoder

-        classifier_file = os.path.join(model_dir, meta.get("classifier"))
-        encoder_file = os.path.join(model_dir, meta.get("encoder"))
+        classifier_file = os.path.join(model_dir, meta["classifier"])
+        encoder_file = os.path.join(model_dir, meta["encoder"])

         if os.path.exists(classifier_file):
             classifier = io_utils.json_unpickle(classifier_file)
diff --git a/rasa/nlu/extractors/regex_entity_extractor.py b/rasa/nlu/extractors/regex_entity_extractor.py
index 42920c7cedaa..dfa3dc5dda07 100644
--- a/rasa/nlu/extractors/regex_entity_extractor.py
+++ b/rasa/nlu/extractors/regex_entity_extractor.py
@@ -119,7 +119,7 @@ def load(
         **kwargs: Any,
     ) -> "RegexEntityExtractor":
         """Loads trained component (see parent class for full docstring)."""
-        file_name = meta.get("file")
+        file_name = meta["file"]
         regex_file = os.path.join(model_dir, file_name)

         if os.path.exists(regex_file):
diff --git a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
index 0374751ea2d3..b64ce0ecdb48 100644
--- a/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
+++ b/rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
@@ -21,7 +21,6 @@
     NO_LENGTH_RESTRICTION,
     NUMBER_OF_SUB_TOKENS,
     TOKENS_NAMES,
-    LANGUAGE_MODEL_DOCS,
 )
 from rasa.shared.nlu.constants import (
     TEXT,
@@ -71,19 +70,14 @@ def __init__(
         self,
         component_config: Optional[Dict[Text, Any]] = None,
         skip_model_load: bool = False,
-        hf_transformers_loaded: bool = False,
     ) -> None:
         """Initializes LanguageModelFeaturizer with the specified model.

         Args:
             component_config: Configuration for the component.
             skip_model_load: Skip loading the model for pytests.
-            hf_transformers_loaded: Skip loading of model and metadata, use
-                HFTransformers output instead.
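The loaders above (`KeywordIntentClassifier`, `SklearnIntentClassifier`, `RegexEntityExtractor`) all move from `meta.get(...)` to direct indexing. The apparent intent is fail-fast behavior: `.get()` returns `None` for a missing key and the failure then surfaces indirectly inside `os.path.join`, whereas `meta["file"]` raises a `KeyError` naming the missing entry at the load site. A small illustrative sketch (the empty `meta` dict is a stand-in for corrupt persisted metadata):

import os

meta = {}  # persisted component metadata that is missing its "file" entry

# Old pattern: the error shows up far from the cause.
file_name = meta.get("file")  # silently None
try:
    os.path.join("models", file_name)
except TypeError as e:
    print(f"late, indirect failure: {e}")

# New pattern: the missing key fails immediately and names the problem.
try:
    file_name = meta["file"]
except KeyError as e:
    print(f"immediate failure at the load site: missing key {e}")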
""" super(LanguageModelFeaturizer, self).__init__(component_config) - if hf_transformers_loaded: - return self._load_model_metadata() self._load_model_instance(skip_model_load) @@ -95,52 +89,7 @@ def create( if not cls.can_handle_language(language): # check failed raise UnsupportedLanguageError(cls.name, language) - # TODO: remove this when HFTransformersNLP is removed for good - if isinstance(config, Metadata): - hf_transformers_loaded = "HFTransformersNLP" in [ - c["name"] for c in config.metadata["pipeline"] - ] - else: - hf_transformers_loaded = "HFTransformersNLP" in config.component_names - return cls(component_config, hf_transformers_loaded=hf_transformers_loaded) - - @classmethod - def load( - cls, - meta: Dict[Text, Any], - model_dir: Text, - model_metadata: Optional["Metadata"] = None, - cached_component: Optional["Component"] = None, - **kwargs: Any, - ) -> "Component": - """Load this component from file. - - After a component has been trained, it will be persisted by - calling `persist`. When the pipeline gets loaded again, - this component needs to be able to restore itself. - Components can rely on any context attributes that are - created by :meth:`components.Component.create` - calls to components previous to this one. - - This method differs from the parent method only in that it calls create - rather than the constructor if the component is not found. This is to - trigger the check for HFTransformersNLP and the method can be removed - when HFTRansformersNLP is removed. - - Args: - meta: Any configuration parameter related to the model. - model_dir: The directory to load the component from. - model_metadata: The model's :class:`rasa.nlu.model.Metadata`. - cached_component: The cached component. - - Returns: - the loaded component - """ - # TODO: remove this when HFTransformersNLP is removed for good - if cached_component: - return cached_component - - return cls.create(meta, model_metadata) + return cls(component_config) def _load_model_metadata(self) -> None: """Load the metadata for the specified model and sets these properties. @@ -219,7 +168,7 @@ def cache_key( Returns: key of the cache for future retrievals. """ - weights = component_meta.get("model_weights") or {} + weights = component_meta.get("model_weights", {}) return ( f"{cls.name}-{component_meta.get('model_name')}-" @@ -744,19 +693,6 @@ def _get_docs_for_batch( Returns: List of language model docs for each message in batch. """ - hf_transformers_doc = batch_examples[0].get(LANGUAGE_MODEL_DOCS[attribute]) - if hf_transformers_doc: - # This should only be the case if the deprecated - # HFTransformersNLP component is used in the pipeline - # TODO: remove this when HFTransformersNLP is removed for good - logging.debug( - f"'{LANGUAGE_MODEL_DOCS[attribute]}' set: this " - f"indicates you're using the deprecated component " - f"HFTransformersNLP, please remove it from your " - f"pipeline." - ) - return [ex.get(LANGUAGE_MODEL_DOCS[attribute]) for ex in batch_examples] - batch_tokens, batch_token_ids = self._get_token_ids_for_batch( batch_examples, attribute ) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index a767a7c3f091..29cceb376167 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -165,28 +165,11 @@ def _load_vocabulary_params(self) -> None: f"`additional_vocabulary_size` in future runs." 
) - def _check_attribute_vocabulary(self, attribute: Text) -> bool: - """Checks if trained vocabulary exists in attribute's count vectorizer.""" - try: - return hasattr(self.vectorizers[attribute], "vocabulary_") - except (AttributeError, KeyError): - return False - def _get_attribute_vocabulary(self, attribute: Text) -> Optional[Dict[Text, int]]: - """Get trained vocabulary from attribute's count vectorizer""" - + """Gets trained vocabulary from attribute's count vectorizer.""" try: return self.vectorizers[attribute].vocabulary_ - except (AttributeError, TypeError): - return None - - def _get_attribute_vocabulary_tokens(self, attribute: Text) -> Optional[List[Text]]: - """Get all keys of vocabulary of an attribute""" - - attribute_vocabulary = self._get_attribute_vocabulary(attribute) - try: - return list(attribute_vocabulary.keys()) - except TypeError: + except (AttributeError, TypeError, KeyError): return None def _check_analyzer(self) -> None: @@ -275,18 +258,14 @@ def _process_tokens(self, tokens: List[Text], attribute: Text = TEXT) -> List[Te def _replace_with_oov_token( self, tokens: List[Text], attribute: Text ) -> List[Text]: - """Replace OOV words with OOV token""" - + """Replace OOV words with OOV token.""" if self.OOV_token and self.analyzer == "word": - vocabulary_exists = self._check_attribute_vocabulary(attribute) - if vocabulary_exists and self.OOV_token in self._get_attribute_vocabulary( - attribute - ): + attribute_vocab = self._get_attribute_vocabulary(attribute) + if attribute_vocab is not None and self.OOV_token in attribute_vocab: # CountVectorizer is trained, process for prediction + attribute_vocabulary_tokens = set(attribute_vocab.keys()) tokens = [ - t - if t in self._get_attribute_vocabulary_tokens(attribute) - else self.OOV_token + t if t in attribute_vocabulary_tokens else self.OOV_token for t in tokens ] elif self.OOV_words: @@ -607,9 +586,8 @@ def _get_featurized_attribute( ) -> Tuple[ List[Optional[scipy.sparse.spmatrix]], List[Optional[scipy.sparse.spmatrix]] ]: - """Return features of a particular attribute for complete data""" - - if self._check_attribute_vocabulary(attribute): + """Returns features of a particular attribute for complete data.""" + if self._get_attribute_vocabulary(attribute) is not None: # count vectorizer was trained return self._create_features(attribute, all_tokens) else: @@ -828,7 +806,7 @@ def load( **kwargs: Any, ) -> "CountVectorsFeaturizer": """Loads trained component (see parent class for full docstring).""" - file_name = meta.get("file") + file_name = meta["file"] featurizer_file = os.path.join(model_dir, file_name) if not os.path.exists(featurizer_file): diff --git a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py index d09d040903fd..a1a172eb30dd 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/lexical_syntactic_featurizer.py @@ -292,7 +292,7 @@ def load( **kwargs: Any, ) -> "LexicalSyntacticFeaturizer": """Loads trained component (see parent class for full docstring).""" - file_name = meta.get("file") + file_name = meta["file"] feature_to_idx_file = Path(model_dir) / f"{file_name}.feature_to_idx_dict.pkl" feature_to_idx_dict = io_utils.json_unpickle(feature_to_idx_file) diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index d0df42e75c8e..f7e2d2260608 100644 --- 
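The `_replace_with_oov_token` rewrite above is also a small performance fix: the old code rebuilt `list(vocabulary.keys())` for every token via `_get_attribute_vocabulary_tokens` and did a linear membership scan, while the new code materializes the vocabulary keys once as a set and does constant-time lookups. A sketch of the new lookup pattern (vocabulary contents are illustrative):

OOV_TOKEN = "__oov__"
vocab = {"hello": 0, "world": 1, OOV_TOKEN: 2}  # illustrative trained vocabulary

tokens = ["hello", "martian", "world"]
vocab_tokens = set(vocab.keys())  # built once per message, not once per token
replaced = [t if t in vocab_tokens else OOV_TOKEN for t in tokens]
assert replaced == ["hello", "__oov__", "world"]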
a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -248,7 +248,7 @@ def load( finetuning. **kwargs: Any other arguments. """ - file_name = meta.get("file") + file_name = meta["file"] patterns_file_name = Path(model_dir) / (file_name + ".patterns.pkl") diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py index 472b0c61f8f2..396d5af7aff5 100644 --- a/rasa/nlu/registry.py +++ b/rasa/nlu/registry.py @@ -34,15 +34,12 @@ from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer from rasa.nlu.model import Metadata from rasa.nlu.selectors.response_selector import ResponseSelector -from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer -from rasa.nlu.tokenizers.lm_tokenizer import LanguageModelTokenizer from rasa.nlu.utils.mitie_utils import MitieNLP from rasa.nlu.utils.spacy_utils import SpacyNLP -from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP from rasa.shared.exceptions import RasaException import rasa.shared.utils.common import rasa.shared.utils.io @@ -61,14 +58,11 @@ # utils SpacyNLP, MitieNLP, - HFTransformersNLP, # tokenizers MitieTokenizer, SpacyTokenizer, WhitespaceTokenizer, - ConveRTTokenizer, JiebaTokenizer, - LanguageModelTokenizer, # extractors SpacyEntityExtractor, MitieEntityExtractor, diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py index bfa465929375..dca83f987f2c 100644 --- a/rasa/nlu/selectors/response_selector.py +++ b/rasa/nlu/selectors/response_selector.py @@ -146,8 +146,9 @@ def required_components(cls) -> List[Type[Component]]: KEY_RELATIVE_ATTENTION: False, # If 'True' use key relative embeddings in attention VALUE_RELATIVE_ATTENTION: False, - # Max position for relative embeddings - MAX_RELATIVE_POSITION: None, + # Max position for relative embeddings. Only in effect if key- or value relative + # attention are turned on + MAX_RELATIVE_POSITION: 5, # Use a unidirectional or bidirectional encoder. UNIDIRECTIONAL_ENCODER: False, # ## Training parameters diff --git a/rasa/nlu/tokenizers/convert_tokenizer.py b/rasa/nlu/tokenizers/convert_tokenizer.py deleted file mode 100644 index 369753791960..000000000000 --- a/rasa/nlu/tokenizers/convert_tokenizer.py +++ /dev/null @@ -1,28 +0,0 @@ -from typing import Dict, Text, Any - -import rasa.shared.utils.io -from rasa.nlu.tokenizers.tokenizer import Tokenizer -from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer - - -class ConveRTTokenizer(WhitespaceTokenizer): - """This tokenizer is deprecated and will be removed in the future. - - The ConveRTFeaturizer component now sets the sub-token information - for dense featurizable attributes of each message object. - """ - - def __init__(self, component_config: Dict[Text, Any] = None) -> None: - """Initializes ConveRTTokenizer with the ConveRT model. - - Args: - component_config: Configuration for the component. - """ - super().__init__(component_config) - rasa.shared.utils.io.raise_warning( - f"'{self.__class__.__name__}' is deprecated and " - f"will be removed in the future. 
" - f"It is recommended to use the '{WhitespaceTokenizer.__name__}' or " - f"another {Tokenizer.__name__} instead.", - category=DeprecationWarning, - ) diff --git a/rasa/nlu/tokenizers/lm_tokenizer.py b/rasa/nlu/tokenizers/lm_tokenizer.py deleted file mode 100644 index fbee73158ef1..000000000000 --- a/rasa/nlu/tokenizers/lm_tokenizer.py +++ /dev/null @@ -1,27 +0,0 @@ -from typing import Dict, Text, Any - -import rasa.shared.utils.io -from rasa.nlu.tokenizers.tokenizer import Tokenizer -from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer - - -class LanguageModelTokenizer(WhitespaceTokenizer): - """This tokenizer is deprecated and will be removed in the future. - - Use the LanguageModelFeaturizer with any other Tokenizer instead. - """ - - def __init__(self, component_config: Dict[Text, Any] = None) -> None: - """Initializes LanguageModelTokenizer for tokenization. - - Args: - component_config: Configuration for the component. - """ - super().__init__(component_config) - rasa.shared.utils.io.raise_warning( - f"'{self.__class__.__name__}' is deprecated and " - f"will be removed in the future. " - f"It is recommended to use the '{WhitespaceTokenizer.__name__}' or " - f"another {Tokenizer.__name__} instead.", - category=DeprecationWarning, - ) diff --git a/rasa/nlu/utils/hugging_face/hf_transformers.py b/rasa/nlu/utils/hugging_face/hf_transformers.py deleted file mode 100644 index 2864b25797d1..000000000000 --- a/rasa/nlu/utils/hugging_face/hf_transformers.py +++ /dev/null @@ -1,752 +0,0 @@ -import logging -from typing import Any, Dict, List, Text, Tuple, Optional - -import rasa.core.utils -from rasa.nlu.model import Metadata -from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer -from rasa.nlu.featurizers.dense_featurizer.lm_featurizer import LanguageModelFeaturizer -from rasa.nlu.components import Component -from rasa.nlu.config import RasaNLUModelConfig -from rasa.shared.nlu.training_data.training_data import TrainingData -from rasa.shared.nlu.training_data.message import Message -import rasa.shared.utils.io -from rasa.nlu.tokenizers.tokenizer import Token -import rasa.utils.train_utils as train_utils -import numpy as np - -from rasa.nlu.constants import ( - LANGUAGE_MODEL_DOCS, - DENSE_FEATURIZABLE_ATTRIBUTES, - SENTENCE_FEATURES, - SEQUENCE_FEATURES, - NUMBER_OF_SUB_TOKENS, - NO_LENGTH_RESTRICTION, -) -from rasa.shared.nlu.constants import TEXT, ACTION_TEXT - -MAX_SEQUENCE_LENGTHS = { - "bert": 512, - "gpt": 512, - "gpt2": 512, - "xlnet": NO_LENGTH_RESTRICTION, - "distilbert": 512, - "roberta": 512, -} - -logger = logging.getLogger(__name__) - - -class HFTransformersNLP(Component): - """This component is deprecated and will be removed in the future. - - Use the LanguageModelFeaturizer instead. - """ - - defaults = { - # name of the language model to load. - "model_name": "bert", - # Pre-Trained weights to be loaded(string) - "model_weights": None, - # an optional path to a specific directory to download - # and cache the pre-trained model weights. 
- "cache_dir": None, - } - - def __init__( - self, - component_config: Optional[Dict[Text, Any]] = None, - skip_model_load: bool = False, - ) -> None: - """Initializes HFTransformsNLP with the models specified.""" - super(HFTransformersNLP, self).__init__(component_config) - - self._load_model_metadata() - self._load_model_instance(skip_model_load) - self.whitespace_tokenizer = WhitespaceTokenizer() - rasa.shared.utils.io.raise_warning( - f"'{self.__class__.__name__}' is deprecated and " - f"will be removed in the future. " - f"It is recommended to use the '{LanguageModelFeaturizer.__name__}' " - f"instead.", - category=DeprecationWarning, - ) - - def _load_model_metadata(self) -> None: - - from rasa.nlu.utils.hugging_face.registry import ( - model_class_dict, - model_weights_defaults, - ) - - self.model_name = self.component_config["model_name"] - - if self.model_name not in model_class_dict: - raise KeyError( - f"'{self.model_name}' not a valid model name. Choose from " - f"{str(list(model_class_dict.keys()))} or create " - f"a new class inheriting from this class to support your model." - ) - - self.model_weights = self.component_config["model_weights"] - self.cache_dir = self.component_config["cache_dir"] - - if not self.model_weights: - logger.info( - f"Model weights not specified. Will choose default model weights: " - f"{model_weights_defaults[self.model_name]}" - ) - self.model_weights = model_weights_defaults[self.model_name] - - self.max_model_sequence_length = MAX_SEQUENCE_LENGTHS[self.model_name] - - def _load_model_instance(self, skip_model_load: bool) -> None: - """Try loading the model instance. - - Args: - skip_model_load: Skip loading the model instances to save time. - This should be True only for pytests - """ - if skip_model_load: - # This should be True only during pytests - return - - from rasa.nlu.utils.hugging_face.registry import ( - model_class_dict, - model_tokenizer_dict, - ) - - logger.debug(f"Loading Tokenizer and Model for {self.model_name}") - - self.tokenizer = model_tokenizer_dict[self.model_name].from_pretrained( - self.model_weights, cache_dir=self.cache_dir - ) - self.model = model_class_dict[self.model_name].from_pretrained( - self.model_weights, cache_dir=self.cache_dir - ) - - # Use a universal pad token since all transformer architectures do not have a - # consistent token. Instead of pad_token_id we use unk_token_id because - # pad_token_id is not set for all architectures. We can't add a new token as - # well since vocabulary resizing is not yet supported for TF classes. - # Also, this does not hurt the model predictions since we use an attention mask - # while feeding input. - self.pad_token_id = self.tokenizer.unk_token_id - - @classmethod - def cache_key( - cls, component_meta: Dict[Text, Any], model_metadata: Metadata - ) -> Optional[Text]: - """Cache the component for future use. - - Args: - component_meta: configuration for the component. - model_metadata: configuration for the whole pipeline. - - Returns: key of the cache for future retrievals. - """ - weights = component_meta.get("model_weights") or {} - - return ( - f"{cls.name}-{component_meta.get('model_name')}-" - f"{rasa.shared.utils.io.deep_container_fingerprint(weights)}" - ) - - @classmethod - def required_packages(cls) -> List[Text]: - return ["transformers"] - - def _lm_tokenize(self, text: Text) -> Tuple[List[int], List[Text]]: - """Pass the text through the tokenizer of the language model. - - Args: - text: Text to be tokenized. 
- - Returns: - List of token ids and token strings. - - """ - split_token_ids = self.tokenizer.encode(text, add_special_tokens=False) - - split_token_strings = self.tokenizer.convert_ids_to_tokens(split_token_ids) - - return split_token_ids, split_token_strings - - def _add_lm_specific_special_tokens( - self, token_ids: List[List[int]] - ) -> List[List[int]]: - """Adds language model specific special tokens. - - These tokens were used during their training. - - Args: - token_ids: List of token ids for each example in the batch. - - Returns: - Augmented list of token ids for each example in the batch. - """ - from rasa.nlu.utils.hugging_face.registry import ( - model_special_tokens_pre_processors, - ) - - augmented_tokens = [ - model_special_tokens_pre_processors[self.model_name](example_token_ids) - for example_token_ids in token_ids - ] - return augmented_tokens - - def _lm_specific_token_cleanup( - self, split_token_ids: List[int], token_strings: List[Text] - ) -> Tuple[List[int], List[Text]]: - """Clean up special chars added by tokenizers of language models. - - Many language models add a special char in front/back of (some) words. We clean - up those chars as they are not - needed once the features are already computed. - - Args: - split_token_ids: List of token ids received as output from the language - model specific tokenizer. - token_strings: List of token strings received as output from the language - model specific tokenizer. - - Returns: - Cleaned up token ids and token strings. - """ - from rasa.nlu.utils.hugging_face.registry import model_tokens_cleaners - - return model_tokens_cleaners[self.model_name](split_token_ids, token_strings) - - def _post_process_sequence_embeddings( - self, sequence_embeddings: np.ndarray - ) -> Tuple[np.ndarray, np.ndarray]: - """Compute sentence level representations and sequence level representations - for relevant tokens. - - Args: - sequence_embeddings: Sequence level dense features received as output from - language model. - - Returns: - Sentence and sequence level representations. - """ - from rasa.nlu.utils.hugging_face.registry import ( - model_embeddings_post_processors, - ) - - sentence_embeddings = [] - post_processed_sequence_embeddings = [] - - for example_embedding in sequence_embeddings: - ( - example_sentence_embedding, - example_post_processed_embedding, - ) = model_embeddings_post_processors[self.model_name](example_embedding) - - sentence_embeddings.append(example_sentence_embedding) - post_processed_sequence_embeddings.append(example_post_processed_embedding) - - return ( - np.array(sentence_embeddings), - np.array(post_processed_sequence_embeddings), - ) - - def _tokenize_example( - self, message: Message, attribute: Text - ) -> Tuple[List[Token], List[int]]: - """Tokenize a single message example. - - Many language models add a special char in front of (some) words and split - words into sub-words. To ensure the entity start and end values matches the - token values, tokenize the text first using the whitespace tokenizer. If - individual tokens are split up into multiple tokens, we add this information - to the respected token. - - Args: - message: Single message object to be processed. - attribute: Property of message to be processed, one of ``TEXT`` or - ``RESPONSE``. - - Returns: - List of token strings and token ids for the corresponding attribute of the - message. 
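The `_lm_tokenize` helper above is the core of the whitespace-then-subword scheme: each whitespace token is re-tokenized by the language model's own tokenizer, and the sub-token count is stored so features can later be re-aligned to one vector per original token. A standalone sketch using the same two HuggingFace tokenizer calls (`AutoTokenizer` stands in for the registry lookup in the removed code; the exact sub-word pieces depend on the vocabulary):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

for word in "unbelievable results".split():
    split_token_ids = tokenizer.encode(word, add_special_tokens=False)
    split_token_strings = tokenizer.convert_ids_to_tokens(split_token_ids)
    # e.g. "unbelievable" -> ["un", "##bel", "##iev", "##able"]; the count is
    # what gets stored as NUMBER_OF_SUB_TOKENS on the whitespace token.
    print(word, split_token_strings, len(split_token_strings))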
- """ - tokens_in = self.whitespace_tokenizer.tokenize(message, attribute) - - tokens_out = [] - - token_ids_out = [] - - for token in tokens_in: - # use lm specific tokenizer to further tokenize the text - split_token_ids, split_token_strings = self._lm_tokenize(token.text) - - split_token_ids, split_token_strings = self._lm_specific_token_cleanup( - split_token_ids, split_token_strings - ) - - token_ids_out += split_token_ids - - token.set(NUMBER_OF_SUB_TOKENS, len(split_token_strings)) - - tokens_out.append(token) - - return tokens_out, token_ids_out - - def _get_token_ids_for_batch( - self, batch_examples: List[Message], attribute: Text - ) -> Tuple[List[List[Token]], List[List[int]]]: - """Compute token ids and token strings for each example in batch. - - A token id is the id of that token in the vocabulary of the language model. - Args: - batch_examples: Batch of message objects for which tokens need to be - computed. - attribute: Property of message to be processed, one of ``TEXT`` or - ``RESPONSE``. - - Returns: - List of token strings and token ids for each example in the batch. - """ - batch_token_ids = [] - batch_tokens = [] - for example in batch_examples: - - example_tokens, example_token_ids = self._tokenize_example( - example, attribute - ) - batch_tokens.append(example_tokens) - batch_token_ids.append(example_token_ids) - - return batch_tokens, batch_token_ids - - @staticmethod - def _compute_attention_mask( - actual_sequence_lengths: List[int], max_input_sequence_length: int - ) -> np.ndarray: - """Compute a mask for padding tokens. - - This mask will be used by the language model so that it does not attend to - padding tokens. - - Args: - actual_sequence_lengths: List of length of each example without any padding. - max_input_sequence_length: Maximum length of a sequence that will be - present in the input batch. This is - after taking into consideration the maximum input sequence the model can - handle. Hence it can never be - greater than self.max_model_sequence_length in case the model applies - length restriction. - - Returns: - Computed attention mask, 0 for padding and 1 for non-padding tokens. - """ - attention_mask = [] - - for actual_sequence_length in actual_sequence_lengths: - # add 1s for present tokens, fill up the remaining space up to max - # sequence length with 0s (non-existing tokens) - padded_sequence = [1] * min( - actual_sequence_length, max_input_sequence_length - ) + [0] * ( - max_input_sequence_length - - min(actual_sequence_length, max_input_sequence_length) - ) - attention_mask.append(padded_sequence) - - attention_mask = np.array(attention_mask).astype(np.float32) - return attention_mask - - def _extract_sequence_lengths( - self, batch_token_ids: List[List[int]] - ) -> Tuple[List[int], int]: - """Extracts the sequence length for each example and maximum sequence length. - - Args: - batch_token_ids: List of token ids for each example in the batch. - - Returns: - Tuple consisting of: the actual sequence lengths for each example, - and the maximum input sequence length (taking into account the - maximum sequence length that the model can handle. 
- """ - # Compute max length across examples - max_input_sequence_length = 0 - actual_sequence_lengths = [] - - for example_token_ids in batch_token_ids: - sequence_length = len(example_token_ids) - actual_sequence_lengths.append(sequence_length) - max_input_sequence_length = max( - max_input_sequence_length, len(example_token_ids) - ) - - # Take into account the maximum sequence length the model can handle - max_input_sequence_length = ( - max_input_sequence_length - if self.max_model_sequence_length == NO_LENGTH_RESTRICTION - else min(max_input_sequence_length, self.max_model_sequence_length) - ) - - return actual_sequence_lengths, max_input_sequence_length - - def _add_padding_to_batch( - self, batch_token_ids: List[List[int]], max_sequence_length_model: int - ) -> List[List[int]]: - """Add padding so that all examples in the batch are of the same length. - - Args: - batch_token_ids: Batch of examples where each example is a non-padded list - of token ids. - max_sequence_length_model: Maximum length of any input sequence in the batch - to be fed to the model. - - Returns: - Padded batch with all examples of the same length. - """ - padded_token_ids = [] - - # Add padding according to max_sequence_length - # Some models don't contain pad token, we use unknown token as padding token. - # This doesn't affect the computation since we compute an attention mask - # anyways. - for example_token_ids in batch_token_ids: - - # Truncate any longer sequences so that they can be fed to the model - if len(example_token_ids) > max_sequence_length_model: - example_token_ids = example_token_ids[:max_sequence_length_model] - - padded_token_ids.append( - example_token_ids - + [self.pad_token_id] - * (max_sequence_length_model - len(example_token_ids)) - ) - return padded_token_ids - - @staticmethod - def _extract_nonpadded_embeddings( - embeddings: np.ndarray, actual_sequence_lengths: List[int] - ) -> np.ndarray: - """Use pre-computed non-padded lengths of each example to extract embeddings - for non-padding tokens. - - Args: - embeddings: sequence level representations for each example of the batch. - actual_sequence_lengths: non-padded lengths of each example of the batch. - - Returns: - Sequence level embeddings for only non-padding tokens of the batch. - """ - nonpadded_sequence_embeddings = [] - for index, embedding in enumerate(embeddings): - unmasked_embedding = embedding[: actual_sequence_lengths[index]] - nonpadded_sequence_embeddings.append(unmasked_embedding) - - return np.array(nonpadded_sequence_embeddings) - - def _compute_batch_sequence_features( - self, batch_attention_mask: np.ndarray, padded_token_ids: List[List[int]] - ) -> np.ndarray: - """Feed the padded batch to the language model. - - Args: - batch_attention_mask: Mask of 0s and 1s which indicate whether the token - is a padding token or not. - padded_token_ids: Batch of token ids for each example. The batch is padded - and hence can be fed at once. - - Returns: - Sequence level representations from the language model. 
- """ - model_outputs = self.model( - np.array(padded_token_ids), attention_mask=np.array(batch_attention_mask) - ) - - # sequence hidden states is always the first output from all models - sequence_hidden_states = model_outputs[0] - - sequence_hidden_states = sequence_hidden_states.numpy() - return sequence_hidden_states - - def _validate_sequence_lengths( - self, - actual_sequence_lengths: List[int], - batch_examples: List[Message], - attribute: Text, - inference_mode: bool = False, - ) -> None: - """Validate if sequence lengths of all inputs are less the max sequence length. - - This method should throw an error during training, whereas log a debug message - during inference if any of the input examples have a length greater than - maximum sequence length allowed. - - Args: - actual_sequence_lengths: original sequence length of all inputs - batch_examples: all message instances in the batch - attribute: attribute of message object to be processed - inference_mode: Whether this is during training or during inferencing - """ - if self.max_model_sequence_length == NO_LENGTH_RESTRICTION: - # There is no restriction on sequence length from the model - return - - for sequence_length, example in zip(actual_sequence_lengths, batch_examples): - if sequence_length > self.max_model_sequence_length: - if not inference_mode: - raise RuntimeError( - f"The sequence length of '{example.get(attribute)[:20]}...' " - f"is too long({sequence_length} tokens) for the " - f"model chosen {self.model_name} which has a maximum " - f"sequence length of {self.max_model_sequence_length} tokens. " - f"Either shorten the message or use a model which has no " - f"restriction on input sequence length like XLNet." - ) - else: - logger.debug( - f"The sequence length of '{example.get(attribute)[:20]}...' " - f"is too long({sequence_length} tokens) for the " - f"model chosen {self.model_name} which has a maximum " - f"sequence length of {self.max_model_sequence_length} tokens. " - f"Downstream model predictions may be affected because of this." - ) - - def _add_extra_padding( - self, sequence_embeddings: np.ndarray, actual_sequence_lengths: List[int] - ) -> np.ndarray: - """Adds extra zero padding to match the original sequence length. - - This is only done if the input was truncated during the batch preparation of - input for the model. - - Args: - sequence_embeddings: Embeddings returned from the model - actual_sequence_lengths: original sequence length of all inputs - - Returns: - Modified sequence embeddings with padding if necessary - """ - if self.max_model_sequence_length == NO_LENGTH_RESTRICTION: - # No extra padding needed because there wouldn't have been any truncation - # in the first place - return sequence_embeddings - - reshaped_sequence_embeddings = [] - for index, embedding in enumerate(sequence_embeddings): - embedding_size = embedding.shape[-1] - if actual_sequence_lengths[index] > self.max_model_sequence_length: - embedding = np.concatenate( - [ - embedding, - np.zeros( - ( - actual_sequence_lengths[index] - - self.max_model_sequence_length, - embedding_size, - ), - dtype=np.float32, - ), - ] - ) - reshaped_sequence_embeddings.append(embedding) - - return np.array(reshaped_sequence_embeddings) - - def _get_model_features_for_batch( - self, - batch_token_ids: List[List[int]], - batch_tokens: List[List[Token]], - batch_examples: List[Message], - attribute: Text, - inference_mode: bool = False, - ) -> Tuple[np.ndarray, np.ndarray]: - """Compute dense features of each example in the batch. 
- - We first add the special tokens corresponding to each language model. Next, we - add appropriate padding and compute a mask for that padding so that it doesn't - affect the feature computation. The padded batch is next fed to the language - model and token level embeddings are computed. Using the pre-computed mask, - embeddings for non-padding tokens are extracted and subsequently sentence - level embeddings are computed. - - Args: - batch_token_ids: List of token ids of each example in the batch. - batch_tokens: List of token objects for each example in the batch. - batch_examples: List of examples in the batch. - attribute: attribute of the Message object to be processed. - inference_mode: Whether the call is during training or during inference. - - Returns: - Sentence and token level dense representations. - """ - # Let's first add tokenizer specific special tokens to all examples - batch_token_ids_augmented = self._add_lm_specific_special_tokens( - batch_token_ids - ) - - # Compute sequence lengths for all examples - ( - actual_sequence_lengths, - max_input_sequence_length, - ) = self._extract_sequence_lengths(batch_token_ids_augmented) - - # Validate that all sequences can be processed based on their sequence lengths - # and the maximum sequence length the model can handle - self._validate_sequence_lengths( - actual_sequence_lengths, batch_examples, attribute, inference_mode - ) - - # Add padding so that whole batch can be fed to the model - padded_token_ids = self._add_padding_to_batch( - batch_token_ids_augmented, max_input_sequence_length - ) - - # Compute attention mask based on actual_sequence_length - batch_attention_mask = self._compute_attention_mask( - actual_sequence_lengths, max_input_sequence_length - ) - - # Get token level features from the model - sequence_hidden_states = self._compute_batch_sequence_features( - batch_attention_mask, padded_token_ids - ) - - # Extract features for only non-padding tokens - sequence_nonpadded_embeddings = self._extract_nonpadded_embeddings( - sequence_hidden_states, actual_sequence_lengths - ) - - # Extract sentence level and post-processed features - ( - sentence_embeddings, - sequence_embeddings, - ) = self._post_process_sequence_embeddings(sequence_nonpadded_embeddings) - - # Pad zeros for examples which were truncated in inference mode. 
- # This is intentionally done after sentence embeddings have been extracted so - # that they are not affected - sequence_embeddings = self._add_extra_padding( - sequence_embeddings, actual_sequence_lengths - ) - - # shape of matrix for all sequence embeddings - batch_dim = len(sequence_embeddings) - seq_dim = max(e.shape[0] for e in sequence_embeddings) - feature_dim = sequence_embeddings[0].shape[1] - shape = (batch_dim, seq_dim, feature_dim) - - # align features with tokens so that we have just one vector per token - # (don't include sub-tokens) - sequence_embeddings = train_utils.align_token_features( - batch_tokens, sequence_embeddings, shape - ) - - # sequence_embeddings is a padded numpy array - # remove the padding, keep just the non-zero vectors - sequence_final_embeddings = [] - for embeddings, tokens in zip(sequence_embeddings, batch_tokens): - sequence_final_embeddings.append(embeddings[: len(tokens)]) - sequence_final_embeddings = np.array(sequence_final_embeddings) - - return sentence_embeddings, sequence_final_embeddings - - def _get_docs_for_batch( - self, - batch_examples: List[Message], - attribute: Text, - inference_mode: bool = False, - ) -> List[Dict[Text, Any]]: - """Compute language model docs for all examples in the batch. - - Args: - batch_examples: Batch of message objects for which language model docs - need to be computed. - attribute: Property of message to be processed, one of ``TEXT`` or - ``RESPONSE``. - inference_mode: Whether the call is during inference or during training. - - - Returns: - List of language model docs for each message in batch. - """ - batch_tokens, batch_token_ids = self._get_token_ids_for_batch( - batch_examples, attribute - ) - - ( - batch_sentence_features, - batch_sequence_features, - ) = self._get_model_features_for_batch( - batch_token_ids, batch_tokens, batch_examples, attribute, inference_mode - ) - - # A doc consists of - # {'token_ids': ..., 'tokens': ..., 'sequence_features': ..., - # 'sentence_features': ...} - batch_docs = [] - for index in range(len(batch_examples)): - doc = { - SEQUENCE_FEATURES: batch_sequence_features[index], - SENTENCE_FEATURES: np.reshape(batch_sentence_features[index], (1, -1)), - } - batch_docs.append(doc) - - return batch_docs - - def train( - self, - training_data: TrainingData, - config: Optional[RasaNLUModelConfig] = None, - **kwargs: Any, - ) -> None: - """Compute tokens and dense features for each message in training data. - - Args: - training_data: NLU training data to be tokenized and featurized - config: NLU pipeline config consisting of all components. - - """ - batch_size = 64 - - for attribute in DENSE_FEATURIZABLE_ATTRIBUTES: - - non_empty_examples = list( - filter(lambda x: x.get(attribute), training_data.training_examples) - ) - - batch_start_index = 0 - - while batch_start_index < len(non_empty_examples): - - batch_end_index = min( - batch_start_index + batch_size, len(non_empty_examples) - ) - # Collect batch examples - batch_messages = non_empty_examples[batch_start_index:batch_end_index] - - # Construct a doc with relevant features - # extracted(tokens, dense_features) - batch_docs = self._get_docs_for_batch(batch_messages, attribute) - - for index, ex in enumerate(batch_messages): - - ex.set(LANGUAGE_MODEL_DOCS[attribute], batch_docs[index]) - - batch_start_index += batch_size - - def process(self, message: Message, **kwargs: Any) -> None: - """Process an incoming message by computing its tokens and dense features. 
- - Args: - message: Incoming message object - """ - # process of all featurizers operates only on TEXT and ACTION_TEXT attributes, - # because all other attributes are labels which are featurized during training - # and their features are stored by the model itself. - for attribute in {TEXT, ACTION_TEXT}: - if message.get(attribute): - message.set( - LANGUAGE_MODEL_DOCS[attribute], - self._get_docs_for_batch( - [message], attribute=attribute, inference_mode=True - )[0], - ) diff --git a/rasa/nlu/utils/mitie_utils.py b/rasa/nlu/utils/mitie_utils.py index 110bccb6e21b..691564c0168a 100644 --- a/rasa/nlu/utils/mitie_utils.py +++ b/rasa/nlu/utils/mitie_utils.py @@ -63,9 +63,9 @@ def cache_key( cls, component_meta: Dict[Text, Any], model_metadata: "Metadata" ) -> Optional[Text]: - mitie_file = component_meta.get("model", None) + mitie_file = component_meta.get("model") if mitie_file is not None: - return cls.name + "-" + str(os.path.abspath(mitie_file)) + return f"{cls.name}-{os.path.abspath(mitie_file)}" else: return None diff --git a/rasa/nlu/utils/spacy_utils.py b/rasa/nlu/utils/spacy_utils.py index f2434f1eab57..21eaea8aebc7 100644 --- a/rasa/nlu/utils/spacy_utils.py +++ b/rasa/nlu/utils/spacy_utils.py @@ -102,7 +102,7 @@ def cache_key( f"More informaton can be found on {DOCS_URL_COMPONENTS}#spacynlp" ) - return cls.name + "-" + spacy_model_name + return f"{cls.name}-{spacy_model_name}" def provide_context(self) -> Dict[Text, Any]: """Creates a context dictionary from spaCy nlp object.""" @@ -266,7 +266,7 @@ def load( if cached_component: return cached_component - nlp = cls.load_model(meta.get("model")) + nlp = cls.load_model(meta["model"]) cls.ensure_proper_language_model(nlp) return cls(meta, nlp) diff --git a/rasa/shared/core/events.py b/rasa/shared/core/events.py index 7d5213e3ff9b..a546dc06614d 100644 --- a/rasa/shared/core/events.py +++ b/rasa/shared/core/events.py @@ -247,19 +247,7 @@ def __init__( metadata: Optional[Dict[Text, Any]] = None, ) -> None: self.timestamp = timestamp or time.time() - self._metadata = metadata or {} - - @property - def metadata(self) -> Dict[Text, Any]: - # Needed for compatibility with Rasa versions <1.4.0. Previous versions - # of Rasa serialized trackers using the pickle module. For the moment, - # Rasa still supports loading these serialized trackers with pickle, - # but will use JSON in any subsequent save operations. Versions of - # trackers serialized with pickle won't include the `_metadata` - # attribute in their events, so it is necessary to define this getter - # in case the attribute does not exist. For more information see - # CHANGELOG.rst. 
- return getattr(self, "_metadata", {}) + self.metadata = metadata or {} def __ne__(self, other: Any) -> bool: # Not strictly necessary, but to avoid having both x==y and x!=y diff --git a/rasa/shared/core/generator.py b/rasa/shared/core/generator.py index 66c99e928778..deb08441ec28 100644 --- a/rasa/shared/core/generator.py +++ b/rasa/shared/core/generator.py @@ -642,7 +642,10 @@ def _process_step( # we concatenate the story block names of the blocks that # contribute to the trackers events if tracker.sender_id: - if step.block_name not in tracker.sender_id.split(" > "): + if ( + step.block_name + and step.block_name not in tracker.sender_id.split(" > ") + ): new_sender = tracker.sender_id + " > " + step.block_name else: new_sender = tracker.sender_id diff --git a/rasa/utils/endpoints.py b/rasa/utils/endpoints.py index 9e040188a692..588ccff5cadb 100644 --- a/rasa/utils/endpoints.py +++ b/rasa/utils/endpoints.py @@ -73,18 +73,19 @@ class EndpointConfig: def __init__( self, - url: Text = None, - params: Dict[Text, Any] = None, - headers: Dict[Text, Any] = None, - basic_auth: Dict[Text, Text] = None, + url: Optional[Text] = None, + params: Optional[Dict[Text, Any]] = None, + headers: Optional[Dict[Text, Any]] = None, + basic_auth: Optional[Dict[Text, Text]] = None, token: Optional[Text] = None, token_name: Text = "token", **kwargs: Any, ) -> None: + """Creates an `EndpointConfig` instance.""" self.url = url - self.params = params if params else {} - self.headers = headers if headers else {} - self.basic_auth = basic_auth + self.params = params or {} + self.headers = headers or {} + self.basic_auth = basic_auth or {} self.token = token self.token_name = token_name self.type = kwargs.pop("store_type", kwargs.pop("type", None)) diff --git a/rasa/utils/tensorflow/data_generator.py b/rasa/utils/tensorflow/data_generator.py index cc50f527d0d1..43493fc1934a 100644 --- a/rasa/utils/tensorflow/data_generator.py +++ b/rasa/utils/tensorflow/data_generator.py @@ -365,9 +365,9 @@ def __init__( # set current epoch to `-1`, so that `on_epoch_end` will increase it to `0` self._current_epoch = -1 # actual batch size will be set inside `on_epoch_end` - self._current_batch_size = None + self._current_batch_size = 0 # create separate data variable that will store modified data for each batch - self._data = None + self._data = {} self.on_epoch_end() def __len__(self) -> int: diff --git a/rasa/utils/tensorflow/transformer.py b/rasa/utils/tensorflow/transformer.py index d589fc6d2a54..1a9b561503cc 100644 --- a/rasa/utils/tensorflow/transformer.py +++ b/rasa/utils/tensorflow/transformer.py @@ -38,7 +38,7 @@ def __init__( unidirectional: bool = False, use_key_relative_position: bool = False, use_value_relative_position: bool = False, - max_relative_position: Optional[int] = None, + max_relative_position: int = 5, heads_share_relative_embedding: bool = False, ) -> None: super().__init__() @@ -56,8 +56,7 @@ def __init__( self.use_key_relative_position = use_key_relative_position self.use_value_relative_position = use_value_relative_position self.relative_length = max_relative_position - if self.relative_length is not None: - self.relative_length += 1 # include current time + self.relative_length += 1 # include current time self.heads_share_relative_embedding = heads_share_relative_embedding self._depth = units // self.num_heads @@ -414,7 +413,7 @@ def __init__( unidirectional: bool = False, use_key_relative_position: bool = False, use_value_relative_position: bool = False, - max_relative_position: Optional[int] = None, 
+ max_relative_position: int = 5, heads_share_relative_embedding: bool = False, ) -> None: super().__init__() @@ -521,7 +520,7 @@ def __init__( unidirectional: bool = False, use_key_relative_position: bool = False, use_value_relative_position: bool = False, - max_relative_position: Optional[int] = None, + max_relative_position: int = 5, heads_share_relative_embedding: bool = False, name: Optional[Text] = None, ) -> None: diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index dd60c1b88a37..dbc9ee5685fa 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -55,7 +55,7 @@ from tensorflow.keras.callbacks import Callback -def normalize(values: np.ndarray, ranking_length: Optional[int] = 0) -> np.ndarray: +def normalize(values: np.ndarray, ranking_length: int = 0) -> np.ndarray: """Normalizes an array of positive numbers over the top `ranking_length` values. Other values will be set to 0. @@ -139,7 +139,7 @@ def align_token_features( ) -> np.ndarray: """Align token features to match tokens. - ConveRTTokenizer, LanguageModelTokenizers might split up tokens into sub-tokens. + ConveRTFeaturizer and LanguageModelFeaturizer might split up tokens into sub-tokens. We need to take the mean of the sub-token vectors and take that as token vector. Args: diff --git a/setup.cfg b/setup.cfg index 6b3e66735e56..22ed56bddf9b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -48,4 +48,4 @@ disallow_untyped_decorators = True # see https://github.com/RasaHQ/rasa/pull/6470 # the list below is sorted by the number of errors for each error code, in decreasing order disable_error_code = arg-type, assignment, var-annotated, union-attr, - override, operator, attr-defined, misc + override, attr-defined, misc diff --git a/tests/conftest.py b/tests/conftest.py index 4c368522debc..94576159a0cd 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -74,20 +74,6 @@ } -TEST_DIALOGUES = [ - "data/test_dialogues/default.json", - "data/test_dialogues/formbot.json", - "data/test_dialogues/moodbot.json", -] - -EXAMPLE_DOMAINS = [ - "data/test_domains/default_with_mapping.yml", - "data/test_domains/default_with_slots.yml", - "examples/formbot/domain.yml", - "data/test_moodbot/domain.yml", -] - - @pytest.fixture(scope="session") def nlu_as_json_path() -> Text: return "data/examples/rasa/demo-rasa.json" diff --git a/tests/core/conftest.py b/tests/core/conftest.py index 1fd232410137..06495d89aecf 100644 --- a/tests/core/conftest.py +++ b/tests/core/conftest.py @@ -25,7 +25,8 @@ from rasa.shared.core.trackers import DialogueStateTracker from rasa.shared.nlu.training_data.features import Features from rasa.shared.nlu.constants import INTENT, ACTION_NAME, FEATURE_TYPE_SENTENCE -from tests.core.utilities import tracker_from_dialogue_file +from tests.dialogues import TEST_MOODBOT_DIALOGUE +from tests.core.utilities import tracker_from_dialogue class CustomSlot(Slot): @@ -193,6 +194,4 @@ def moodbot_features( @pytest.fixture def moodbot_tracker(moodbot_domain: Domain) -> DialogueStateTracker: - return tracker_from_dialogue_file( - "data/test_dialogues/moodbot.json", moodbot_domain - ) + return tracker_from_dialogue(TEST_MOODBOT_DIALOGUE, moodbot_domain) diff --git a/tests/core/test_broker.py b/tests/core/test_broker.py index 81814e31aa91..2c86a8ace81e 100644 --- a/tests/core/test_broker.py +++ b/tests/core/test_broker.py @@ -262,6 +262,7 @@ async def test_kafka_broker_from_config(): topic="topic", partition_by_sender=True, security_protocol="SASL_PLAINTEXT", + convert_intent_id_to_string=True, ) assert 
actual.url == expected.url @@ -270,6 +271,54 @@ async def test_kafka_broker_from_config(): assert actual.sasl_mechanism == expected.sasl_mechanism assert actual.topic == expected.topic assert actual.partition_by_sender == expected.partition_by_sender + assert actual.convert_intent_id_to_string == expected.convert_intent_id_to_string + + +async def test_kafka_broker_convert_intent_id_to_string(): + user_event = { + "timestamp": 1517821726.200036, + "metadata": {}, + "parse_data": { + "entities": [], + "intent": {"confidence": 0.54, "name": "greet", "id": 7703045398849936579}, + "message_id": "987654321", + "metadata": {}, + "text": "/greet", + "intent_ranking": [ + {"confidence": 0.54, "name": "greet", "id": 7703045398849936579}, + {"confidence": 0.31, "name": "goodbye", "id": -5127945386715371244}, + {"confidence": 0.15, "name": "default", "id": 1699173715362944540}, + ], + }, + "event": "user", + "text": "/greet", + "input_channel": "rest", + "message_id": "987654321", + } + actual = KafkaEventBroker( + "localhost", + sasl_username="username", + sasl_password="password", + sasl_mechanism="PLAIN", + topic="topic", + partition_by_sender=True, + security_protocol="SASL_PLAINTEXT", + convert_intent_id_to_string=True, + ) + + converted_user_event = actual._convert_intent_id_to_string(user_event) + intent_ranking = user_event["parse_data"]["intent_ranking"] + converted_intent_ranking = converted_user_event["parse_data"]["intent_ranking"] + + assert converted_user_event["parse_data"]["intent"]["id"] == str( + user_event["parse_data"]["intent"]["id"] + ) + assert all( + converted_parse_data["id"] == str(parse_data["id"]) + for parse_data, converted_parse_data in zip( + intent_ranking, converted_intent_ranking + ) + ) @pytest.mark.parametrize( diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py index cff5fb22954a..118470432ad0 100644 --- a/tests/core/test_policies.py +++ b/tests/core/test_policies.py @@ -31,7 +31,8 @@ from rasa.core.policies.ted_policy import TEDPolicy from rasa.core.policies.memoization import AugmentedMemoizationPolicy, MemoizationPolicy from rasa.shared.core.trackers import DialogueStateTracker -from tests.core.utilities import get_tracker, read_dialogue_file +from tests.dialogues import TEST_DEFAULT_DIALOGUE +from tests.core.utilities import get_tracker, tracker_from_dialogue async def train_trackers( @@ -229,11 +230,7 @@ async def test_memorise( def test_memorise_with_nlu( self, trained_policy: MemoizationPolicy, default_domain: Domain ): - filename = "data/test_dialogues/default.json" - dialogue = read_dialogue_file(filename) - - tracker = DialogueStateTracker(dialogue.name, default_domain.slots) - tracker.recreate_from_dialogue(dialogue) + tracker = tracker_from_dialogue(TEST_DEFAULT_DIALOGUE, default_domain) states = trained_policy._prediction_states(tracker, default_domain) recalled = trained_policy.recall(states, tracker, default_domain) diff --git a/tests/core/test_processor.py b/tests/core/test_processor.py index 7008e677b5c9..835f23ff0d68 100644 --- a/tests/core/test_processor.py +++ b/tests/core/test_processor.py @@ -29,6 +29,7 @@ UserMessage, OutputChannel, ) +from rasa.exceptions import ActionLimitReached from rasa.shared.core.domain import SessionConfig, Domain, KEY_ACTIONS from rasa.shared.core.events import ( ActionExecuted, @@ -1297,3 +1298,33 @@ def test_predict_next_action_with_hidden_rules(): action, prediction = processor.predict_next_action(tracker) assert isinstance(action, ActionListen) assert not prediction.hide_rule_turn + + +def 
test_predict_next_action_raises_limit_reached_exception(domain: Domain):
+    interpreter = RegexInterpreter()
+    ensemble = SimplePolicyEnsemble(policies=[RulePolicy(), MemoizationPolicy()])
+    tracker_store = InMemoryTrackerStore(domain)
+    lock_store = InMemoryLockStore()
+
+    processor = MessageProcessor(
+        interpreter,
+        ensemble,
+        domain,
+        tracker_store,
+        lock_store,
+        TemplatedNaturalLanguageGenerator(domain.responses),
+        max_number_of_predictions=1,
+    )
+
+    tracker = DialogueStateTracker.from_events(
+        "test",
+        evts=[
+            ActionExecuted(ACTION_LISTEN_NAME),
+            UserUttered("Hi!"),
+            ActionExecuted("test_action"),
+        ],
+    )
+    tracker.set_latest_action({"action_name": "test_action"})
+
+    with pytest.raises(ActionLimitReached):
+        processor.predict_next_action(tracker)
diff --git a/tests/core/utilities.py b/tests/core/utilities.py
index c3a81579cae5..64f225effcba 100644
--- a/tests/core/utilities.py
+++ b/tests/core/utilities.py
@@ -1,14 +1,9 @@
 import itertools
-
 import contextlib
+import os
 import typing
 from typing import List, Optional, Text, Any, Dict
-import jsonpickle
-import os
-
-import rasa.shared.utils.io
-import rasa.utils.io
 from rasa.shared.core.domain import Domain
 from rasa.shared.core.events import UserUttered, Event
 from rasa.shared.core.trackers import DialogueStateTracker
@@ -18,20 +13,12 @@
 from rasa.shared.core.conversation import Dialogue
 
 
-def tracker_from_dialogue_file(
-    filename: Text, domain: Optional[Domain] = None
-) -> DialogueStateTracker:
-    dialogue = read_dialogue_file(filename)
-
+def tracker_from_dialogue(dialogue: "Dialogue", domain: Domain) -> DialogueStateTracker:
     tracker = DialogueStateTracker(dialogue.name, domain.slots)
     tracker.recreate_from_dialogue(dialogue)
     return tracker
 
 
-def read_dialogue_file(filename: Text) -> "Dialogue":
-    return jsonpickle.loads(rasa.shared.utils.io.read_file(filename))
-
-
 @contextlib.contextmanager
 def cwd(path: Text):
     CWD = os.getcwd()
diff --git a/tests/dialogues.py b/tests/dialogues.py
new file mode 100644
index 000000000000..9bb8326397d2
--- /dev/null
+++ b/tests/dialogues.py
@@ -0,0 +1,277 @@
+from rasa.shared.core.conversation import Dialogue
+from rasa.shared.core.events import (
+    SlotSet,
+    UserUttered,
+    ActionExecuted,
+    ActiveLoop,
+    BotUttered,
+)
+
+
+TEST_DEFAULT_DIALOGUE = Dialogue(
+    name="default",
+    events=[
+        ActionExecuted(action_name="action_listen", timestamp=1551952977.4850519,),
+        UserUttered(
+            entities=[{"end": 19, "entity": "name", "start": 14, "value": "Peter"}],
+            intent={"confidence": 0.0, "name": "greet"},
+            message_id=None,
+            parse_data={
+                "entities": [
+                    {"end": 19, "entity": "name", "start": 14, "value": "Peter"}
+                ],
+                "intent": {"confidence": 0.0, "name": "greet"},
+                "message_id": None,
+                "metadata": {},
+                "text": "Hi my name is Peter",
+            },
+            text="Hi my name is Peter",
+            timestamp=1551953035.076376,
+        ),
+        SlotSet(key="name", timestamp=1551953035.076385, value="Peter"),
+        ActionExecuted(action_name="utter_greet", timestamp=1551953040.607782,),
+        BotUttered(
+            data={"attachment": None, "buttons": None, "elements": None},
+            text="hey there Peter!",
+            timestamp=1551953040.60779,
+        ),
+    ],
+)
+TEST_FORMBOT_DIALOGUE = Dialogue(
+    name="formbot",
+    events=[
+        ActionExecuted(action_name="action_listen", timestamp=1551884035.892855,),
+        UserUttered(
+            intent={"confidence": 0.3748943507671356, "name": "greet"},
+            parse_data={
+                "entities": [],
+                "intent": {"confidence": 0.3748943507671356, "name": "greet"},
+                "text": "Hi I'm desperate to talk to you",
+            },
+            text="Hi I'm desperate to talk to you",
+            timestamp=1551884050.259948,
+        ),
+        ActionExecuted(
+            action_name="utter_greet",
+            confidence=1.0,
+            policy="policy_2_MemoizationPolicy",
+            timestamp=1551884060.466681,
+        ),
+        BotUttered(
+            data={"attachment": None, "buttons": None, "elements": None},
+            text="Hello! I am restaurant search assistant! How can I help?",
+            timestamp=1551884060.46669,
+        ),
+        ActionExecuted(
+            action_name="action_listen",
+            confidence=1.0,
+            policy="policy_2_MemoizationPolicy",
+            timestamp=1551884061.9350882,
+        ),
+        UserUttered(
+            entities=[
+                {"end": 18, "entity": "cuisine", "start": 16, "value": "an"},
+                {"end": 48, "entity": "location", "start": 42, "value": "Bombay"},
+            ],
+            intent={"confidence": 0.9414282441139221, "name": "request_restaurant"},
+            parse_data={
+                "entities": [
+                    {"end": 18, "entity": "cuisine", "start": 16, "value": "an"},
+                    {"end": 48, "entity": "location", "start": 42, "value": "Bombay"},
+                ],
+                "intent": {
+                    "confidence": 0.9414282441139221,
+                    "name": "request_restaurant",
+                },
+                "text": "I'm looking for an indian restaurant...in Bombay",
+            },
+            text="I'm looking for an indian restaurant...in Bombay",
+            timestamp=1551884090.9653602,
+        ),
+        ActionExecuted(
+            action_name="restaurant_form",
+            confidence=1.0,
+            policy="policy_2_MemoizationPolicy",
+            timestamp=1551884095.542748,
+        ),
+        ActionExecuted(
+            action_name="utter_slots_values",
+            confidence=1.0,
+            policy="policy_2_MemoizationPolicy",
+            timestamp=1551884097.570883,
+        ),
+        BotUttered(
+            data={"attachment": None, "buttons": None, "elements": None},
+            text=(
+                "I am going to run a restaurant search "
+                "using the following parameters:\n"
+                " - cuisine: None\n - num_people: None\n"
+                " - outdoor_seating: None\n"
+                " - preferences: None\n - feedback: None"
+            ),
+            timestamp=1551884097.57089,
+        ),
+        ActionExecuted(
+            action_name="action_listen",
+            confidence=1.0,
+            policy="policy_2_MemoizationPolicy",
+            timestamp=1551884098.8006358,
+        ),
+        UserUttered(
+            intent={"confidence": 0.2287036031484604, "name": "affirm"},
+            parse_data={
+                "entities": [],
+                "intent": {"confidence": 0.2287036031484604, "name": "affirm"},
+                "text": "Let's just pretend everything went correctly",
+            },
+            text="Let's just pretend everything went correctly",
+            timestamp=1551884208.092693,
+        ),
+        ActionExecuted(
+            action_name="action_deactivate_loop", timestamp=1551884214.951055,
+        ),
+        ActiveLoop(name=None, timestamp=1551884214.9510589),
+        SlotSet(key="requested_slot", timestamp=1551884214.951062, value=None),
+        ActionExecuted(
+            action_name="action_listen",
+            confidence=0.7680902069097734,
+            policy="policy_0_TEDPolicy",
+            timestamp=1551884216.705635,
+        ),
+    ],
+)
+TEST_MOODBOT_DIALOGUE = Dialogue(
+    name="moodbot",
+    events=[
+        ActionExecuted(action_name="action_listen", timestamp=1551883958.346432,),
+        UserUttered(
+            intent={"confidence": 0.44488201660555066, "name": "greet"},
+            parse_data={
+                "entities": [],
+                "intent": {"confidence": 0.44488201660555066, "name": "greet"},
+                "intent_ranking": [
+                    {"confidence": 0.44488201660555066, "name": "greet"},
+                    {"confidence": 0.29023286595689257, "name": "goodbye"},
+                    {"confidence": 0.10501227521380094, "name": "mood_great"},
+                    {"confidence": 0.06879303900502878, "name": "mood_unhappy"},
+                    {"confidence": 0.04903582960375451, "name": "deny"},
+                    {"confidence": 0.04204397361497238, "name": "affirm"},
+                ],
+                "text": "Hi talk to me",
+            },
+            text="Hi talk to me",
+            timestamp=1551883971.410778,
+        ),
+        ActionExecuted(
+            action_name="utter_greet",
+            confidence=1.0,
+            policy="policy_2_MemoizationPolicy",
+            timestamp=1551883975.6456478,
+        ),
+        BotUttered(
+            data={
+                "attachment": None,
+                "buttons": [
+                    {"payload": "great", "title": "great"},
+                    {"payload": "super sad", "title": "super sad"},
+                ],
+                "elements": None,
+            },
+            text="Hey! How are you?",
+            timestamp=1551883975.645656,
+        ),
+        ActionExecuted(
+            action_name="action_listen",
+            confidence=1.0,
+            policy="policy_2_MemoizationPolicy",
+            timestamp=1551883979.098331,
+        ),
+        UserUttered(
+            intent={"confidence": 0.7417646502470048, "name": "mood_unhappy"},
+            parse_data={
+                "entities": [],
+                "intent": {"confidence": 0.7417646502470048, "name": "mood_unhappy"},
+                "intent_ranking": [
+                    {"confidence": 0.7417646502470048, "name": "mood_unhappy"},
+                    {"confidence": 0.1439688162980615, "name": "mood_great"},
+                    {"confidence": 0.04577343822867981, "name": "goodbye"},
+                    {"confidence": 0.037760394267609965, "name": "greet"},
+                    {"confidence": 0.017715563733253295, "name": "affirm"},
+                    {"confidence": 0.013017137225390567, "name": "deny"},
+                ],
+                "text": "Super sad",
+            },
+            text="Super sad",
+            timestamp=1551883982.540276,
+        ),
+        ActionExecuted(
+            action_name="utter_cheer_up",
+            confidence=1.0,
+            policy="policy_2_MemoizationPolicy",
+            timestamp=1551883985.031668,
+        ),
+        BotUttered(
+            data={
+                "attachment": "https://i.imgur.com/nGF1K8f.jpg",
+                "buttons": None,
+                "elements": None,
+            },
+            text="Here is something to cheer you up:",
+            timestamp=1551883985.0316749,
+        ),
+        ActionExecuted(
+            action_name="utter_did_that_help",
+            confidence=1.0,
+            policy="policy_2_MemoizationPolicy",
+            timestamp=1551883985.940413,
+        ),
+        BotUttered(
+            data={"attachment": None, "buttons": None, "elements": None},
+            text="Did that help you?",
+            timestamp=1551883985.940421,
+        ),
+        ActionExecuted(
+            action_name="action_listen",
+            confidence=1.0,
+            policy="policy_2_MemoizationPolicy",
+            timestamp=1551883986.958556,
+        ),
+        UserUttered(
+            intent={"confidence": 0.8162296627642036, "name": "deny"},
+            parse_data={
+                "entities": [],
+                "intent": {"confidence": 0.8162296627642036, "name": "deny"},
+                "intent_ranking": [
+                    {"confidence": 0.8162296627642036, "name": "deny"},
+                    {"confidence": 0.07152463661481759, "name": "mood_unhappy"},
+                    {"confidence": 0.05028159510181415, "name": "greet"},
+                    {"confidence": 0.02662414324721426, "name": "affirm"},
+                    {"confidence": 0.024343883584915963, "name": "goodbye"},
+                    {"confidence": 0.010996078687034375, "name": "mood_great"},
+                ],
+                "text": "No",
+            },
+            text="No",
+            timestamp=1551883989.0720608,
+        ),
+        ActionExecuted(
+            action_name="utter_goodbye",
+            confidence=1.0,
+            policy="policy_2_MemoizationPolicy",
+            timestamp=1551883991.061463,
+        ),
+        BotUttered(
+            data={"attachment": None, "buttons": None, "elements": None},
+            text="Bye",
+            timestamp=1551883991.061471,
+        ),
+    ],
+)
+
+TEST_DIALOGUES = [TEST_DEFAULT_DIALOGUE, TEST_FORMBOT_DIALOGUE, TEST_MOODBOT_DIALOGUE]
+
+TEST_DOMAINS_FOR_DIALOGUES = [
+    "data/test_domains/default_with_slots.yml",
+    "examples/formbot/domain.yml",
+    "data/test_moodbot/domain.yml",
+]
diff --git a/tests/nlu/conftest.py b/tests/nlu/conftest.py
index 1690c5a2d7d2..c19756a4a979 100644
--- a/tests/nlu/conftest.py
+++ b/tests/nlu/conftest.py
@@ -69,7 +69,7 @@ def pretrained_embeddings_convert_config() -> RasaNLUModelConfig:
         {
             "language": "en",
             "pipeline": [
-                {"name": "ConveRTTokenizer"},
+                {"name": "WhitespaceTokenizer"},
                 {"name": "ConveRTFeaturizer"},
                 {"name": "DIETClassifier", EPOCHS: 1, RANDOM_SEED: 42},
             ],
diff --git a/tests/nlu/featurizers/test_lm_featurizer.py b/tests/nlu/featurizers/test_lm_featurizer.py
index e5d771e81017..452fbca01abd 100644
--- a/tests/nlu/featurizers/test_lm_featurizer.py
+++ b/tests/nlu/featurizers/test_lm_featurizer.py
@@ -10,16 +10,11 @@
 from rasa.nlu.constants import (
     TOKENS_NAMES,
     NUMBER_OF_SUB_TOKENS,
-    SEQUENCE_FEATURES,
-    SENTENCE_FEATURES,
-    LANGUAGE_MODEL_DOCS,
 )
-from rasa.nlu.tokenizers.lm_tokenizer import LanguageModelTokenizer
 from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
 from rasa.shared.nlu.training_data.training_data import TrainingData
 from rasa.shared.nlu.training_data.message import Message
 from rasa.nlu.featurizers.dense_featurizer.lm_featurizer import LanguageModelFeaturizer
-from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP
 from rasa.shared.nlu.constants import TEXT, INTENT
@@ -655,52 +650,6 @@ def test_attention_mask(
     assert np.all(mask_zeros == 0)
 
 
-def test_log_deprecation_warning_with_old_config(caplog: LogCaptureFixture):
-    message = Message.build("hi there")
-
-    transformers_nlp = HFTransformersNLP(
-        {"model_name": "bert", "model_weights": "bert-base-uncased"}
-    )
-    transformers_nlp.process(message)
-
-    caplog.set_level(logging.DEBUG)
-    lm_tokenizer = LanguageModelTokenizer()
-    lm_tokenizer.process(message)
-    lm_featurizer = LanguageModelFeaturizer(skip_model_load=True)
-    caplog.clear()
-    with caplog.at_level(logging.DEBUG):
-        lm_featurizer.process(message)
-
-    assert "deprecated component HFTransformersNLP" in caplog.text
-
-
-def test_preserve_sentence_and_sequence_features_old_config():
-    attribute = TEXT
-    message = Message.build("hi there")
-
-    transformers_nlp = HFTransformersNLP(
-        {"model_name": "bert", "model_weights": "bert-base-uncased"}
-    )
-    transformers_nlp.process(message)
-    lm_tokenizer = LanguageModelTokenizer()
-    lm_tokenizer.process(message)
-
-    lm_featurizer = LanguageModelFeaturizer({"model_name": "gpt2"})
-    lm_featurizer.process(message)
-
-    message.set(LANGUAGE_MODEL_DOCS[attribute], None)
-    lm_docs = lm_featurizer._get_docs_for_batch(
-        [message], attribute=attribute, inference_mode=True
-    )[0]
-    hf_docs = transformers_nlp._get_docs_for_batch(
-        [message], attribute=attribute, inference_mode=True
-    )[0]
-    assert not (message.features[0].features == lm_docs[SEQUENCE_FEATURES]).any()
-    assert not (message.features[1].features == lm_docs[SENTENCE_FEATURES]).any()
-    assert (message.features[0].features == hf_docs[SEQUENCE_FEATURES]).all()
-    assert (message.features[1].features == hf_docs[SENTENCE_FEATURES]).all()
-
-
 @pytest.mark.parametrize(
     "text, tokens, expected_feature_tokens",
     [
diff --git a/tests/nlu/test_train.py b/tests/nlu/test_train.py
index 4569798e5260..9d93593964c0 100644
--- a/tests/nlu/test_train.py
+++ b/tests/nlu/test_train.py
@@ -12,7 +12,6 @@
 COMPONENTS_TEST_PARAMS = {
     "DIETClassifier": {EPOCHS: 1},
     "ResponseSelector": {EPOCHS: 1},
-    "HFTransformersNLP": {"model_name": "bert", "model_weights": "bert-base-uncased"},
     "LanguageModelFeaturizer": {
         "model_name": "bert",
         "model_weights": "bert-base-uncased",
@@ -73,10 +72,7 @@ def pipelines_for_tests() -> List[Tuple[Text, List[Dict[Text, Any]]]]:
         (
             "en",
             as_pipeline(
-                "HFTransformersNLP",
-                "LanguageModelTokenizer",
-                "LanguageModelFeaturizer",
-                "DIETClassifier",
+                "WhitespaceTokenizer", "LanguageModelFeaturizer", "DIETClassifier",
             ),
         ),
         ("fallback", as_pipeline("KeywordIntentClassifier", "FallbackClassifier")),
@@ -130,7 +126,7 @@ def test_all_components_are_in_at_least_one_test_pipeline():
     for cls in registry.component_classes:
         if "convert" in cls.name.lower():
             # TODO
-            # skip ConveRTTokenizer and ConveRTFeaturizer as the ConveRT model is not
+            # skip ConveRTFeaturizer as the ConveRT model is not
             # publicly available anymore
             # (see https://github.com/RasaHQ/rasa/issues/6806)
             continue
diff --git a/tests/nlu/utils/test_hf_transformers.py b/tests/nlu/utils/test_hf_transformers.py
deleted file mode 100644
index 8531e59023b4..000000000000
--- a/tests/nlu/utils/test_hf_transformers.py
+++ /dev/null
@@ -1,413 +0,0 @@
-import pytest
-import numpy as np
-from typing import List, Text, Tuple
-import logging
-
-from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP
-from rasa.shared.nlu.training_data.message import Message
-from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
-from rasa.nlu.constants import LANGUAGE_MODEL_DOCS
-from rasa.shared.nlu.constants import TEXT
-from rasa.shared.nlu.training_data.training_data import TrainingData
-
-# this would normally go into conftest, but HFTransformers is set to be
-# deprecated and it only makes sense for this and LanguageModelFeaturizer
-from tests.nlu.featurizers.test_lm_featurizer import (
-    create_pretrained_transformers_config,
-)
-
-
-@pytest.mark.parametrize(
-    "input_sequence_length, model_name, should_overflow",
-    [(20, "bert", False), (1000, "bert", True), (1000, "xlnet", False)],
-)
-def test_sequence_length_overflow_train(
-    input_sequence_length: int, model_name: Text, should_overflow: bool
-):
-    component = HFTransformersNLP({"model_name": model_name}, skip_model_load=True)
-    message = Message.build(text=" ".join(["hi"] * input_sequence_length))
-    if should_overflow:
-        with pytest.raises(RuntimeError):
-            component._validate_sequence_lengths(
-                [input_sequence_length], [message], "text", inference_mode=False
-            )
-    else:
-        component._validate_sequence_lengths(
-            [input_sequence_length], [message], "text", inference_mode=False
-        )
-
-
-@pytest.mark.parametrize(
-    "sequence_embeddings, actual_sequence_lengths, model_name, padding_needed",
-    [
-        (np.ones((1, 512, 5)), [1000], "bert", True),
-        (np.ones((1, 512, 5)), [1000], "xlnet", False),
-        (np.ones((1, 256, 5)), [256], "bert", False),
-    ],
-)
-def test_long_sequences_extra_padding(
-    sequence_embeddings: np.ndarray,
-    actual_sequence_lengths: List[int],
-    model_name: Text,
-    padding_needed: bool,
-):
-    component = HFTransformersNLP({"model_name": model_name}, skip_model_load=True)
-    modified_sequence_embeddings = component._add_extra_padding(
-        sequence_embeddings, actual_sequence_lengths
-    )
-    if not padding_needed:
-        assert np.all(modified_sequence_embeddings) == np.all(sequence_embeddings)
-    else:
-        assert modified_sequence_embeddings.shape[1] == actual_sequence_lengths[0]
-        assert (
-            modified_sequence_embeddings[0].shape[-1]
-            == sequence_embeddings[0].shape[-1]
-        )
-        zero_embeddings = modified_sequence_embeddings[0][
-            sequence_embeddings.shape[1] :
-        ]
-        assert np.all(zero_embeddings == 0)
-
-
-@pytest.mark.parametrize(
-    "token_ids, max_sequence_length_model, resulting_length, padding_added",
-    [
-        ([[1] * 200], 512, 512, True),
-        ([[1] * 700], 512, 512, False),
-        ([[1] * 200], 200, 200, False),
-    ],
-)
-def test_input_padding(
-    token_ids: List[List[int]],
-    max_sequence_length_model: int,
-    resulting_length: int,
-    padding_added: bool,
-):
-    component = HFTransformersNLP(
-        {"model_name": "bert", "model_weights": "bert-base-uncased"},
-        skip_model_load=True,
-    )
-    component.pad_token_id = 0
-    padded_input = component._add_padding_to_batch(token_ids, max_sequence_length_model)
-    assert len(padded_input[0]) == resulting_length
-    if padding_added:
-        original_length = len(token_ids[0])
-        assert np.all(np.array(padded_input[0][original_length:]) == 0)
-
-
-@pytest.mark.parametrize(
-    "sequence_length, model_name, should_overflow",
-    [(1000, "bert", True), (256, "bert", False)],
-)
-def test_log_longer_sequence(
-    sequence_length: int, model_name: Text, should_overflow: bool, caplog
-):
-    transformers_config = {
-        "model_name": model_name,
-        "model_weights": "bert-base-uncased",
-    }
-
-    transformers_nlp = HFTransformersNLP(transformers_config)
-
-    text = " ".join(["hi"] * sequence_length)
-    message = Message.build(text)
-
-    caplog.set_level(logging.DEBUG)
-    transformers_nlp.process(message)
-    if should_overflow:
-        assert "hi hi hi" in caplog.text
-    assert message.get("text_language_model_doc") is not None
-
-
-@pytest.mark.parametrize(
-    "actual_sequence_length, max_input_sequence_length, zero_start_index",
-    [(256, 512, 256), (700, 700, 700), (700, 512, 512)],
-)
-def test_attention_mask(
-    actual_sequence_length: int, max_input_sequence_length: int, zero_start_index: int
-):
-    component = HFTransformersNLP(
-        {"model_name": "bert", "model_weights": "bert-base-uncased"},
-        skip_model_load=True,
-    )
-
-    attention_mask = component._compute_attention_mask(
-        [actual_sequence_length], max_input_sequence_length
-    )
-    mask_ones = attention_mask[0][:zero_start_index]
-    mask_zeros = attention_mask[0][zero_start_index:]
-
-    assert np.all(mask_ones == 1)
-    assert np.all(mask_zeros == 0)
-
-
-def train_texts(
-    texts: List[Text], model_name: Text, model_weights: Text
-) -> List[Message]:
-    config = create_pretrained_transformers_config(model_name, model_weights)
-    whitespace_tokenizer = WhitespaceTokenizer()
-    transformer = HFTransformersNLP(config)
-
-    messages = [Message.build(text=text) for text in texts]
-    td = TrainingData(messages)
-
-    whitespace_tokenizer.train(td)
-    transformer.train(td)
-    return messages
-
-
-def process_texts(
-    texts: List[Text], model_name: Text, model_weights: Text
-) -> List[Message]:
-    config = create_pretrained_transformers_config(model_name, model_weights)
-    whitespace_tokenizer = WhitespaceTokenizer()
-    transformer = HFTransformersNLP(config)
-
-    messages = []
-    for text in texts:
-        message = Message.build(text=text)
-        whitespace_tokenizer.process(message)
-        transformer.process(message)
-        messages.append(message)
-    return messages
-
-
-@pytest.mark.parametrize(
-    "model_name, model_weights, texts, expected_shape, "
-    "expected_sequence_vec, expected_cls_vec",
-    [
-        (
-            "bert",
-            None,
-            ["Good evening.", "here is the sentence I want embeddings for."],
-            [(3, 768), (9, 768)],
-            [
-                [0.6569931, 0.77279466],
-                [0.21718428, 0.34955627, 0.59124136, 0.6869872, 0.16993292],
-            ],
-            [
-                [0.29528213, 0.5543281, -0.4091331, 0.65817744, 0.81740487],
-                [-0.17215663, 0.26811457, -0.1922609, -0.63926417, -1.626383],
-            ],
-        ),
-        (
-            "bert",
-            "bert-base-uncased",
-            ["Good evening.", "here is the sentence I want embeddings for."],
-            [(3, 768), (9, 768)],
-            [
-                [0.57274431, -0.16078192],
-                [-0.54851216, 0.09632845, -0.42788929, 0.11438307, 0.18316516],
-            ],
-            [
-                [0.06880389, 0.32802248, -0.11250392, -0.11338016, -0.37116382],
-                [0.05909365, 0.06433402, 0.08569094, -0.16530040, -0.11396892],
-            ],
-        ),
-        (
-            "gpt",
-            None,
-            ["Good evening.", "here is the sentence I want embeddings for."],
-            [(3, 768), (9, 768)],
-            [
-                [-0.0630323737859726, 0.4029877185821533],
-                [
-                    0.8072432279586792,
-                    -0.08990508317947388,
-                    0.9985930919647217,
-                    -0.38779014348983765,
-                    0.08921952545642853,
-                ],
-            ],
-            [
-                [
-                    0.16997766494750977,
-                    0.1493849903345108,
-                    0.39421725273132324,
-                    -0.5753618478775024,
-                    0.05096133053302765,
-                ],
-                [
-                    0.41056010127067566,
-                    -0.1169343888759613,
-                    -0.3019704818725586,
-                    -0.40207183361053467,
-                    0.6289798021316528,
-                ],
-            ],
-        ),
-        (
-            "gpt2",
-            None,
-            ["Good evening.", "here is the sentence I want embeddings for."],
-            [(3, 768), (9, 768)],
-            [
-                [-0.03382749, -0.05373593],
-                [-0.18434484, -0.5386464, -0.11122551, -0.95434338, 0.28311089],
-            ],
-            [
-                [
-                    -0.04710008203983307,
-                    -0.2793063223361969,
-                    -0.23804056644439697,
-                    -0.3212292492389679,
-                    0.11430201679468155,
-                ],
-                [
-                    -0.1809544414281845,
-                    -0.017152192071080208,
-                    -0.3176477551460266,
-                    -0.008387327194213867,
-                    0.3365338146686554,
-                ],
-            ],
-        ),
-        (
-            "xlnet",
-            None,
-            ["Good evening.", "here is the sentence I want embeddings for."],
-            [(3, 768), (9, 768)],
-            [
-                [1.7612367868423462, 2.5819129943847656],
-                [
-                    0.784195065498352,
-                    0.7068007588386536,
-                    1.5883606672286987,
-                    1.891886591911316,
-                    2.5209126472473145,
-                ],
-            ],
-            [
-                [
-                    2.171574831008911,
-                    -1.5377449989318848,
-                    -3.2671749591827393,
-                    0.22520869970321655,
-                    -1.598855972290039,
-                ],
-                [
-                    1.6516317129135132,
-                    0.021670114248991013,
-                    -2.5114030838012695,
-                    1.447351098060608,
-                    -2.5866634845733643,
-                ],
-            ],
-        ),
-        (
-            "distilbert",
-            None,
-            ["Good evening.", "here is the sentence I want embeddings for."],
-            [(3, 768), (9, 768)],
-            [
-                [0.22866562008857727, -0.0575055330991745],
-                [
-                    -0.6448041796684265,
-                    -0.5105321407318115,
-                    -0.4892978072166443,
-                    0.17531153559684753,
-                    0.22717803716659546,
-                ],
-            ],
-            [
-                [
-                    -0.09814466536045074,
-                    -0.07325993478298187,
-                    0.22358475625514984,
-                    -0.20274735987186432,
-                    -0.07363069802522659,
-                ],
-                [
-                    -0.146609365940094,
-                    -0.07373693585395813,
-                    0.016850866377353668,
-                    -0.2407529354095459,
-                    -0.0979844480752945,
-                ],
-            ],
-        ),
-        (
-            "roberta",
-            None,
-            ["Good evening.", "here is the sentence I want embeddings for."],
-            [(3, 768), (9, 768)],
-            [
-                [-0.3092685, 0.09567838],
-                [0.02152853, -0.08026707, -0.1080862, 0.12423468, -0.05378958],
-            ],
-            [
-                [
-                    -0.03930358216166496,
-                    0.034788478165864944,
-                    0.12246038764715195,
-                    0.08401528000831604,
-                    0.7026961445808411,
-                ],
-                [
-                    -0.018586941063404083,
-                    -0.09835464507341385,
-                    0.03242188319563866,
-                    0.09366855770349503,
-                    0.4458026587963104,
-                ],
-            ],
-        ),
-    ],
-)
-class TestShapeValuesTrainAndProcess:
-    @staticmethod
-    def evaluate_message_shape_values(
-        messages: List[Message],
-        expected_shape: List[tuple],
-        expected_sequence_vec: List[List[float]],
-        expected_cls_vec: List[List[float]],
-    ) -> None:
-        for index in range(len(messages)):
-            lm_docs = messages[index].get(LANGUAGE_MODEL_DOCS[TEXT])
-            computed_sequence_vec = lm_docs["sequence_features"]
-            computed_sentence_vec = lm_docs["sentence_features"]
-
-            assert computed_sequence_vec.shape[0] == expected_shape[index][0] - 1
-            assert computed_sequence_vec.shape[1] == expected_shape[index][1]
-            assert computed_sentence_vec.shape[0] == 1
-            assert computed_sentence_vec.shape[1] == expected_shape[index][1]
-
-            # Look at the value of first dimension for a few starting timesteps
-            assert np.allclose(
-                computed_sequence_vec[: len(expected_sequence_vec[index]), 0],
-                expected_sequence_vec[index],
-                atol=1e-4,
-            )
-
-            # Look at the first value of first five dimensions
-            assert np.allclose(
-                computed_sentence_vec[0][:5], expected_cls_vec[index], atol=1e-4
-            )
-
-    def test_hf_transformers_shape_values_train(
-        self,
-        model_name: Text,
-        model_weights: Text,
-        texts: List[Text],
-        expected_shape: List[Tuple[int]],
-        expected_sequence_vec: List[List[float]],
-        expected_cls_vec: List[List[float]],
-    ):
-        messages = train_texts(texts, model_name, model_weights)
-        self.evaluate_message_shape_values(
-            messages, expected_shape, expected_sequence_vec, expected_cls_vec
-        )
-
-    def test_hf_transformers_shape_values_process(
-        self,
-        model_name: Text,
-        model_weights: Text,
-        texts: List[Text],
-        expected_shape: List[Tuple[int]],
-        expected_sequence_vec: List[List[float]],
-        expected_cls_vec: List[List[float]],
-    ):
-        messages = process_texts(texts, model_name, model_weights)
-        self.evaluate_message_shape_values(
-            messages, expected_shape, expected_sequence_vec, expected_cls_vec
-        )
diff --git a/tests/shared/core/test_dialogues.py b/tests/shared/core/test_dialogues.py
index 0018d9e53c11..9928a9c0703e 100644
--- a/tests/shared/core/test_dialogues.py
+++ b/tests/shared/core/test_dialogues.py
@@ -1,34 +1,23 @@
 import json
-import jsonpickle
 import pytest
 
-import rasa.shared.utils.io
-import rasa.utils.io
 from rasa.shared.core.conversation import Dialogue
 from rasa.shared.core.domain import Domain
 from rasa.core.tracker_store import InMemoryTrackerStore
-from tests.conftest import (
+from tests.dialogues import (
     TEST_DIALOGUES,
-    EXAMPLE_DOMAINS,
+    TEST_DEFAULT_DIALOGUE,
+    TEST_DOMAINS_FOR_DIALOGUES,
 )
-from tests.core.utilities import tracker_from_dialogue_file
+from tests.core.utilities import tracker_from_dialogue
 
 
-@pytest.mark.parametrize("filename", TEST_DIALOGUES)
-def test_dialogue_serialisation(filename, domain: Domain):
-    dialogue_json = rasa.shared.utils.io.read_file(filename)
-    restored = json.loads(dialogue_json)
-    tracker = tracker_from_dialogue_file(filename, domain)
-    en_de_coded = json.loads(jsonpickle.encode(tracker.as_dialogue()))
-    assert restored == en_de_coded
-
-
-@pytest.mark.parametrize("pair", zip(TEST_DIALOGUES, EXAMPLE_DOMAINS))
+@pytest.mark.parametrize("pair", zip(TEST_DIALOGUES, TEST_DOMAINS_FOR_DIALOGUES))
 def test_inmemory_tracker_store(pair):
-    filename, domainpath = pair
+    dialogue, domainpath = pair
     domain = Domain.load(domainpath)
-    tracker = tracker_from_dialogue_file(filename, domain)
+    tracker = tracker_from_dialogue(dialogue, domain)
     tracker_store = InMemoryTrackerStore(domain)
     tracker_store.save(tracker)
     restored = tracker_store.retrieve(tracker.sender_id)
@@ -36,15 +25,13 @@
 
 def test_tracker_default(domain: Domain):
-    filename = "data/test_dialogues/default.json"
-    tracker = tracker_from_dialogue_file(filename, domain)
+    tracker = tracker_from_dialogue(TEST_DEFAULT_DIALOGUE, domain)
     assert tracker.get_slot("name") == "Peter"
     assert tracker.get_slot("price") is None  # slot doesn't exist!
 
 
 def test_dialogue_from_parameters(domain: Domain):
-    filename = "data/test_dialogues/default.json"
-    tracker = tracker_from_dialogue_file(filename, domain)
+    tracker = tracker_from_dialogue(TEST_DEFAULT_DIALOGUE, domain)
     serialised_dialogue = InMemoryTrackerStore.serialise_tracker(tracker)
     deserialised_dialogue = Dialogue.from_parameters(json.loads(serialised_dialogue))
     assert tracker.as_dialogue().as_dict() == deserialised_dialogue.as_dict()
diff --git a/tests/shared/core/test_trackers.py b/tests/shared/core/test_trackers.py
index 4b3f80b33b97..400dc234c3f9 100644
--- a/tests/shared/core/test_trackers.py
+++ b/tests/shared/core/test_trackers.py
@@ -58,13 +58,13 @@
 from rasa.core.tracker_store import TrackerStore
 from rasa.shared.core.trackers import DialogueStateTracker, EventVerbosity
 from tests.core.conftest import MockedMongoTrackerStore
-from tests.conftest import (
-    EXAMPLE_DOMAINS,
+from tests.dialogues import (
     TEST_DIALOGUES,
+    TEST_MOODBOT_DIALOGUE,
+    TEST_DOMAINS_FOR_DIALOGUES,
 )
 from tests.core.utilities import (
-    tracker_from_dialogue_file,
-    read_dialogue_file,
+    tracker_from_dialogue,
     user_uttered,
     get_tracker,
 )
@@ -99,13 +99,14 @@ def stores_to_be_tested_ids():
     return ["redis-tracker", "in-memory-tracker", "SQL-tracker", "mongo-tracker"]
 
 
-def test_tracker_duplicate():
-    filename = "data/test_dialogues/moodbot.json"
-    dialogue = read_dialogue_file(filename)
-    tracker = DialogueStateTracker(dialogue.name, test_domain.slots)
-    tracker.recreate_from_dialogue(dialogue)
+def test_tracker_duplicate(moodbot_domain: Domain):
+    tracker = tracker_from_dialogue(TEST_MOODBOT_DIALOGUE, moodbot_domain)
     num_actions = len(
-        [event for event in dialogue.events if isinstance(event, ActionExecuted)]
+        [
+            event
+            for event in TEST_MOODBOT_DIALOGUE.events
+            if isinstance(event, ActionExecuted)
+        ]
     )
 
     # There is always one duplicated tracker more than we have actions,
@@ -142,20 +143,18 @@ def test_tracker_store_storage_and_retrieval(store: TrackerStore):
 
 
 @pytest.mark.parametrize("store", stores_to_be_tested(), ids=stores_to_be_tested_ids())
-@pytest.mark.parametrize("pair", zip(TEST_DIALOGUES, EXAMPLE_DOMAINS))
+@pytest.mark.parametrize("pair", zip(TEST_DIALOGUES, TEST_DOMAINS_FOR_DIALOGUES))
 def test_tracker_store(store, pair):
-    filename, domainpath = pair
+    dialogue, domainpath = pair
     domain = Domain.load(domainpath)
-    tracker = tracker_from_dialogue_file(filename, domain)
+    tracker = tracker_from_dialogue(dialogue, domain)
     store.save(tracker)
     restored = store.retrieve(tracker.sender_id)
 
     assert restored == tracker
 
 
 async def test_tracker_write_to_story(tmp_path: Path, moodbot_domain: Domain):
-    tracker = tracker_from_dialogue_file(
-        "data/test_dialogues/moodbot.json", moodbot_domain
-    )
+    tracker = tracker_from_dialogue(TEST_MOODBOT_DIALOGUE, moodbot_domain)
     p = tmp_path / "export.yml"
     tracker.export_stories_to_file(str(p))
     trackers = await training.load_data(
diff --git a/tests/test_model_training.py b/tests/test_model_training.py
index a72795dab9b8..a6f3f362e473 100644
--- a/tests/test_model_training.py
+++ b/tests/test_model_training.py
@@ -11,8 +11,10 @@
 from _pytest.logging import LogCaptureFixture
 from _pytest.monkeypatch import MonkeyPatch
 
+import rasa
 from rasa.core.policies.ted_policy import TEDPolicy
 import rasa.model
+import rasa.model_training
 import rasa.core
 import rasa.core.train
 import rasa.nlu
@@ -22,8 +24,6 @@
 from rasa.core.agent import Agent
 from rasa.core.interpreter import RasaNLUInterpreter
 from rasa.nlu.model import Interpreter
-
-from rasa.model_training import train_core, train_nlu, train, dry_run_result
 from rasa.utils.tensorflow.constants import EPOCHS
 from tests.conftest import AsyncMock
 from tests.test_model import _fingerprint
@@ -93,7 +93,7 @@ def test_train_temp_files(
     monkeypatch.setattr(tempfile, "tempdir", tmp_path / "training")
     output = str(tmp_path / "models")
 
-    train(
+    rasa.train(
         domain_path,
         stack_config_path,
         [stories_path, nlu_data_path],
@@ -106,7 +106,7 @@
     # After training the model, try to do it again. This shouldn't try to train
     # a new model because nothing has been changed. It also shouldn't create
     # any temp files.
-    train(
+    rasa.train(
         domain_path, stack_config_path, [stories_path, nlu_data_path], output=output,
     )
@@ -125,7 +125,7 @@ def test_train_core_temp_files(
 
     monkeypatch.setattr(tempfile, "tempdir", tmp_path / "training")
 
-    train_core(
+    rasa.model_training.train_core(
         domain_path, stack_config_path, stories_path, output=str(tmp_path / "models"),
     )
@@ -143,7 +143,9 @@ def test_train_nlu_temp_files(
 
     monkeypatch.setattr(tempfile, "tempdir", tmp_path / "training")
 
-    train_nlu(stack_config_path, nlu_data_path, output=str(tmp_path / "models"))
+    rasa.model_training.train_nlu(
+        stack_config_path, nlu_data_path, output=str(tmp_path / "models")
+    )
 
     assert count_temp_rasa_files(tempfile.tempdir) == 0
@@ -160,7 +162,7 @@ def test_train_nlu_wrong_format_error_message(
 
     monkeypatch.setattr(tempfile, "tempdir", tmp_path / "training")
 
-    train_nlu(
+    rasa.model_training.train_nlu(
         stack_config_path, incorrect_nlu_data_path, output=str(tmp_path / "models")
     )
@@ -172,7 +174,7 @@ def test_train_nlu_with_responses_no_domain_warns(tmp_path: Path):
     data_path = "data/test_nlu_no_responses/nlu_no_responses.yml"
 
     with pytest.warns(UserWarning) as records:
-        train_nlu(
+        rasa.model_training.train_nlu(
             "data/test_config/config_response_selector_minimal.yml",
             data_path,
             output=str(tmp_path / "models"),
@@ -190,7 +192,7 @@ def test_train_nlu_with_responses_and_domain_no_warns(tmp_path: Path):
     domain_path = "data/test_nlu_no_responses/domain_with_only_responses.yml"
 
     with pytest.warns(None) as records:
-        train_nlu(
+        rasa.model_training.train_nlu(
             "data/test_config/config_response_selector_minimal.yml",
             data_path,
             output=str(tmp_path / "models"),
@@ -215,7 +217,9 @@ def test_train_nlu_no_nlu_file_error_message(
 
     monkeypatch.setattr(tempfile, "tempdir", tmp_path / "training")
 
-    train_nlu(stack_config_path, "", output=str(tmp_path / "models"))
+    rasa.model_training.train_nlu(
+        stack_config_path, "", output=str(tmp_path / "models")
+    )
 
     captured = capsys.readouterr()
     assert "No NLU data given" in captured.out
@@ -242,7 +246,7 @@ def test_trained_interpreter_passed_to_core_training(
     # Mock the actual Core training
     _train_core = mock_core_training(monkeypatch)
 
-    train(
+    rasa.train(
         domain_path, config_path, [stories_path, nlu_data_path], str(tmp_path),
     )
@@ -275,7 +279,7 @@ def test_interpreter_of_old_model_passed_to_core_training(
     # Mock the actual Core training
     _train_core = mock_core_training(monkeypatch)
 
-    train(
+    rasa.train(
         domain_path, config_path, [stories_path, nlu_data_path], str(tmp_path),
     )
@@ -315,7 +319,7 @@ def test_train_core_autoconfig(
     )
 
     # do training
-    train_core(
+    rasa.model_training.train_core(
         domain_path,
         stack_config_path,
         stories_path,
@@ -344,7 +348,7 @@ def test_train_nlu_autoconfig(
     )
 
     # do training
-    train_nlu(
+    rasa.model_training.train_nlu(
         stack_config_path, nlu_data_path, output="test_train_nlu_temp_files_models",
     )
@@ -390,7 +394,7 @@ def test_e2e_gives_experimental_warning(
     mock_core_training(monkeypatch)
 
     with caplog.at_level(logging.WARNING):
-        train(
+        rasa.train(
             domain_path,
             stack_config_path,
             [e2e_stories_path, nlu_data_path],
@@ -416,7 +420,7 @@ def test_models_not_retrained_if_no_new_data(
     mocked_nlu_training = mock_nlu_training(monkeypatch)
     mocked_core_training = mock_core_training(monkeypatch)
 
-    train(
+    rasa.train(
         domain_path,
         stack_config_path,
         [e2e_stories_path, nlu_data_path],
@@ -445,7 +449,7 @@ def test_retrains_nlu_and_core_if_new_e2e_example(
     mocked_nlu_training = mock_nlu_training(monkeypatch)
     mocked_core_training = mock_core_training(monkeypatch)
 
-    new_model_path = train(
+    new_model_path = rasa.train(
         domain_path,
         stack_config_path,
         [new_stories_file, nlu_data_path],
@@ -475,7 +479,7 @@ def test_retrains_only_core_if_new_e2e_example_seen_before(
     mocked_nlu_training = mock_nlu_training(monkeypatch)
     mocked_core_training = mock_core_training(monkeypatch)
 
-    new_model_path = train(
+    new_model_path = rasa.train(
         domain_path,
         stack_config_path,
         [new_stories_file, nlu_data_path],
@@ -498,7 +502,7 @@ def test_nlu_and_core_trained_if_no_nlu_data_but_e2e_stories(
     mocked_core_training = mock_core_training(monkeypatch)
 
     output = self.make_tmp_model_dir(tmp_path)
-    train(
+    rasa.train(
         domain_path, stack_config_path, [e2e_stories_path], output=output,
     )
@@ -530,7 +534,7 @@ def test_new_nlu_data_retrains_core_if_there_are_e2e_stories(
     mocked_nlu_training = mock_nlu_training(monkeypatch)
     mocked_core_training = mock_core_training(monkeypatch)
 
-    new_model_path = train(
+    new_model_path = rasa.train(
         domain_path,
         stack_config_path,
         [e2e_stories_path, new_nlu_file],
@@ -560,7 +564,7 @@ def test_new_nlu_data_does_not_retrain_core_if_there_are_no_e2e_stories(
     mocked_nlu_training = mock_nlu_training(monkeypatch)
     mocked_core_training = mock_core_training(monkeypatch)
 
-    new_model_path = train(
+    new_model_path = rasa.train(
         domain_path,
         stack_config_path,
         [simple_stories_path, new_nlu_file],
@@ -585,7 +589,7 @@ def test_training_core_with_e2e_fails_gracefully(
     mocked_core_training = mock_core_training(monkeypatch)
 
     output = self.make_tmp_model_dir(tmp_path)
-    train_core(
+    rasa.model_training.train_core(
         domain_path, stack_config_path, e2e_stories_path, output=output,
     )
@@ -620,7 +624,7 @@ def test_model_finetuning(
     if use_latest_model:
         trained_rasa_model = str(Path(trained_rasa_model).parent)
 
-    train(
+    rasa.train(
         domain_path,
         stack_config_path,
         [stories_path, nlu_data_path],
@@ -674,7 +678,7 @@ def test_model_finetuning_core(
     new_stories_path = tmp_path / "new_stories.yml"
     rasa.shared.utils.io.write_yaml(old_stories, new_stories_path)
 
-    train_core(
+    rasa.model_training.train_core(
         "data/test_moodbot/domain.yml",
         str(new_config_path),
         str(new_stories_path),
@@ -707,7 +711,7 @@ def test_model_finetuning_core_with_default_epochs(
     new_config_path = tmp_path / "new_config.yml"
     rasa.shared.utils.io.write_yaml(old_config, new_config_path)
 
-    train_core(
+    rasa.model_training.train_core(
         "data/test_moodbot/domain.yml",
         str(new_config_path),
         "data/test_moodbot/data/stories.yml",
@@ -739,7 +743,7 @@ def test_model_finetuning_core_new_domain_label(
     rasa.shared.utils.io.write_yaml(old_domain, new_domain_path)
 
     with pytest.raises(SystemExit):
-        train_core(
+        rasa.model_training.train_core(
            domain=str(new_domain_path),
            config="data/test_moodbot/config.yml",
            stories="data/test_moodbot/data/stories.yml",
@@ -765,7 +769,7 @@ def test_model_finetuning_new_domain_label_stops_all_training(
     rasa.shared.utils.io.write_yaml(old_domain, new_domain_path)
 
     with pytest.raises(SystemExit):
-        train(
+        rasa.train(
             domain=str(new_domain_path),
             config="data/test_moodbot/config.yml",
             training_files=[
@@ -815,7 +819,7 @@ def test_model_finetuning_nlu(
     new_nlu_path = tmp_path / "new_nlu.yml"
     rasa.shared.utils.io.write_yaml(old_nlu, new_nlu_path)
 
-    train_nlu(
+    rasa.model_training.train_nlu(
         str(new_config_path),
         str(new_nlu_path),
         domain="data/test_moodbot/domain.yml",
@@ -853,7 +857,7 @@ def test_model_finetuning_nlu_new_label(
     rasa.shared.utils.io.write_yaml(old_nlu, new_nlu_path)
 
     with pytest.raises(SystemExit):
-        train_nlu(
+        rasa.model_training.train_nlu(
             "data/test_moodbot/config.yml",
             str(new_nlu_path),
             domain="data/test_moodbot/domain.yml",
@@ -878,7 +882,7 @@ def test_model_finetuning_nlu_new_entity(
     rasa.shared.utils.io.write_yaml(old_nlu, new_nlu_path)
 
     with pytest.raises(SystemExit):
-        train_nlu(
+        rasa.model_training.train_nlu(
             "data/test_moodbot/config.yml",
             str(new_nlu_path),
             domain="data/test_moodbot/domain.yml",
@@ -909,7 +913,7 @@ def test_model_finetuning_nlu_new_label_already_in_domain(
     rasa.shared.utils.io.write_yaml(old_nlu, new_nlu_path)
 
     with pytest.raises(SystemExit):
-        train_nlu(
+        rasa.model_training.train_nlu(
             config_path,
             str(new_nlu_path),
             domain=domain_path,
@@ -933,7 +937,7 @@ def test_model_finetuning_nlu_new_label_to_domain_only(
     new_domain_path = tmp_path / "new_domain.yml"
     rasa.shared.utils.io.write_yaml(old_domain, new_domain_path)
 
-    train_nlu(
+    rasa.model_training.train_nlu(
         "data/test_moodbot/config.yml",
         "data/test_moodbot/data/nlu.yml",
         domain=str(new_domain_path),
@@ -960,7 +964,7 @@ def test_model_finetuning_nlu_with_default_epochs(
     new_config_path = tmp_path / "new_config.yml"
     rasa.shared.utils.io.write_yaml(old_config, new_config_path)
 
-    train_nlu(
+    rasa.model_training.train_nlu(
         str(new_config_path),
         "data/test_moodbot/data/nlu.yml",
         output=output,
@@ -994,7 +998,7 @@ def test_model_finetuning_with_invalid_model(
     output = str(tmp_path / "models")
 
     with pytest.raises(SystemExit):
-        train(
+        rasa.train(
            domain_path,
            stack_config_path,
            [stories_path, nlu_data_path],
@@ -1025,7 +1029,7 @@ def test_model_finetuning_with_invalid_model_core(
     output = str(tmp_path / "models")
 
     with pytest.raises(SystemExit):
-        train_core(
+        rasa.model_training.train_core(
             domain_path,
             stack_config_path,
             stories_path,
@@ -1055,7 +1059,7 @@ def test_model_finetuning_with_invalid_model_nlu(
     output = str(tmp_path / "models")
 
     with pytest.raises(SystemExit):
-        train_nlu(
+        rasa.model_training.train_nlu(
             stack_config_path,
             nlu_data_path,
             domain=domain_path,
@@ -1112,6 +1116,6 @@ def test_model_finetuning_with_invalid_model_nlu(
 def test_dry_run_result(
     result: rasa.model.FingerprintComparisonResult, code: int, texts_count: int,
 ):
-    result_code, texts = dry_run_result(result)
+    result_code, texts = rasa.model_training.dry_run_result(result)
     assert result_code == code
     assert len(texts) == texts_count
diff --git a/tests/train.py b/tests/train.py
index f12d5bd27a4e..ea22293a0eed 100644
--- a/tests/train.py
+++ b/tests/train.py
@@ -5,7 +5,6 @@
 COMPONENTS_TEST_PARAMS = {
     "DIETClassifier": {EPOCHS: 1},
     "ResponseSelector": {EPOCHS: 1},
-    "HFTransformersNLP": {"model_name": "bert", "model_weights": "bert-base-uncased"},
     "LanguageModelFeaturizer": {
         "model_name": "bert",
         "model_weights": "bert-base-uncased",
@@ -63,10 +62,7 @@ def pipelines_for_tests() -> List[Tuple[Text, List[Dict[Text, Any]]]]:
         (
             "en",
             as_pipeline(
-                "HFTransformersNLP",
-                "LanguageModelTokenizer",
-                "LanguageModelFeaturizer",
-                "DIETClassifier",
+                "WhitespaceTokenizer", "LanguageModelFeaturizer", "DIETClassifier",
             ),
         ),
         ("fallback", as_pipeline("KeywordIntentClassifier", "FallbackClassifier")),