diff --git a/changelog/6457.improvement.md b/changelog/6457.improvement.md new file mode 100644 index 000000000000..821a5d33ec8d --- /dev/null +++ b/changelog/6457.improvement.md @@ -0,0 +1 @@ +Support for test stories written in yaml format. diff --git a/data/test_dialogues/default.json b/data/test_dialogues/default.json index abadf0e407ad..1a8ce03e8000 100644 --- a/data/test_dialogues/default.json +++ b/data/test_dialogues/default.json @@ -39,6 +39,8 @@ "confidence": 0.0, "name": "greet" }, + "message_id": null, + "metadata": {}, "text": "Hi my name is Peter" }, "text": "Hi my name is Peter", diff --git a/data/test_endpoints/event_brokers/file_endpoint.yml b/data/test_endpoints/event_brokers/file_endpoint.yml deleted file mode 100644 index 13c7002f397e..000000000000 --- a/data/test_endpoints/event_brokers/file_endpoint.yml +++ /dev/null @@ -1,3 +0,0 @@ -event_broker: - path: "rasa_event.log" - type: file diff --git a/data/test_multi_domain/config.yml b/data/test_multi_domain/config.yml index 61e0b5b7133b..dba37fe301bd 100644 --- a/data/test_multi_domain/config.yml +++ b/data/test_multi_domain/config.yml @@ -1,17 +1,10 @@ language: en pipeline: - - name: SpacyNLP - - name: SpacyTokenizer - - name: SpacyFeaturizer - - name: RegexFeaturizer - - name: CRFEntityExtractor - - name: EntitySynonymMapper - - name: SklearnIntentClassifier + - name: "KeywordIntentClassifier" policies: - name: MemoizationPolicy - - name: TEDPolicy importers: - name: MultiProjectImporter diff --git a/data/test_trackers/tracker_moodbot.json b/data/test_trackers/tracker_moodbot.json index 4103cedf0f66..634047470629 100644 --- a/data/test_trackers/tracker_moodbot.json +++ b/data/test_trackers/tracker_moodbot.json @@ -5,6 +5,8 @@ "confidence": 0.60, "name": "mood_great" }, + "message_id": null, + "metadata": {}, "text": "/mood_great", "intent_ranking": [ { @@ -46,6 +48,8 @@ "confidence": 0.54, "name": "greet" }, + "message_id": null, + "metadata": {}, "text": "/greet", "intent_ranking": [ { @@ -89,6 +93,8 @@ "confidence": 0.60, "name": "mood_great" }, + "message_id": null, + "metadata": {}, "text": "/mood_great", "intent_ranking": [ { diff --git a/docs/docs/business-logic.mdx b/docs/docs/business-logic.mdx index ebc006fd2447..79c187fb6f27 100644 --- a/docs/docs/business-logic.mdx +++ b/docs/docs/business-logic.mdx @@ -292,7 +292,7 @@ Here's a minimal checklist of files we modified to handle business logic using a * `actions.py`: Define the form action, including the `required_slots`, `slot_mappings` and `submit` methods -* `data/nlu.md`: +* `data/nlu.yml`: * Add examples for an intent to activate the form @@ -308,7 +308,7 @@ Here's a minimal checklist of files we modified to handle business logic using a * Add all intents and entities from your NLU training data -* `data/stories.md`: Add a story for the form +* `data/stories.yml`: Add a story for the form * `config.yml`: diff --git a/docs/docs/chitchat-faqs.mdx b/docs/docs/chitchat-faqs.mdx index 53904b852f2d..0cb89973bbe2 100644 --- a/docs/docs/chitchat-faqs.mdx +++ b/docs/docs/chitchat-faqs.mdx @@ -6,109 +6,71 @@ title: Chitchat and FAQs import useBaseUrl from '@docusaurus/useBaseUrl'; - - -After following the basics of [prototyping an assistant](./prototype-an-assistant.mdx), we'll -now walk through building a basic FAQ chatbot and then build a bot that can handle -contextual conversations. - - - FAQ assistants are the simplest assistants to build and a good place to get started. -These assistants allow the user to ask a simple question and get a response. 
We're going to
-build a basic FAQ assistant using features of Rasa designed specifically for this type of assistant.
+These assistants allow the user to ask a simple question and get a response. You're
+going to build a basic FAQ assistant using features of Rasa designed specifically
+for this type of assistant. You can handle chitchat in the same way.

-In this section we're going to cover the following topics:
+In this section you will read about the following topics:

-* [Responding to simple intents](./chitchat-faqs.mdx#respond-with-memoization-policy) with the MemoizationPolicy
+* [Responding to simple messages](./chitchat-faqs.mdx#responding-to-simple-messages)
+  with the `MemoizationPolicy`

-* [Handling FAQs](./chitchat-faqs.mdx#faqs-response-selector) using the ResponseSelector
+* [Handling FAQs](./chitchat-faqs.mdx#handling-faqs-using-a-response-selector)
+  using a `ResponseSelector`

-We're going to use content from [Sara](https://github.com/RasaHQ/rasa-demo), the Rasa
-assistant that, amongst other things, helps the user get started with the Rasa products.
-You should [first install Rasa](installation.mdx)
-and then [prototype an assistant](prototype-an-assistant.mdx)
-to make sure you know the basics.
+## Prerequisites

-To prepare for this tutorial, we're going to start a new Rasa project:
+You should [first install Rasa](installation.mdx) and
+[prototype an assistant](prototype-an-assistant.mdx) to make sure
+you understand the basic concepts of intents, stories and domains.
+
+For this tutorial, you can create a new Rasa project using the CLI:

 ```bash
 rasa init
 ```

-Let's remove the default content from this bot, so that the `data/nlu.yml`, `data/stories.yml`
-and `domain.yml` files are empty.
+If you want to start from scratch, you should remove the default content from
+the `data/nlu.yml`, `data/stories.yml` and `domain.yml` files.
+
+## Responding to simple messages

-
+Responding to single messages requires the assistant to recognize the message and
+trigger a response.

-## Memoization Policy
+Rasa Open Source uses what is called a `MemoizationPolicy` to
+predict what it needs to do once it receives such a message.

-The MemoizationPolicy remembers examples from training stories for up to a `max_history`
-of turns. One “turn” includes the message sent by the user and any actions the
-assistant performed before waiting for the next message. For the purpose of a simple,
-context-less FAQ bot, we only need to pay attention to the last message the user sent,
-and therefore we'll set that to `1`.
+:::note MemoizationPolicy
+If the `policies` key in your `config.yml` is empty (or only contains
+comments) the `MemoizationPolicy` will be added automatically using
+[config suggestions](model-configuration.mdx#suggested-config).
+You do not need to configure any policies.

-You can do this by editing your configuration file as follows
-(you can remove `TEDPolicy` for now):
+If you customized your `policies`, you need to make sure the
+[Memoization Policy](policies.mdx#memoization-policy)
+is part of your configuration:

-```yaml title="config.yml"
+```yaml-rasa title="config.yml"
 policies:
 - name: MemoizationPolicy
   max_history: 1
-- name: MappingPolicy
-```
-
-:::note MappingPolicy
-The `MappingPolicy` is there because it handles the logic of the `/restart` intent,
-which allows you to clear the conversation history and start fresh.
- -::: - -Now that we've defined our policies, we can add some stories for the `goodbye`, `thank` and `greet` -intents to our stories: - -```yaml title="data/stories.yml" -stories: -- story: greet # name of the story - steps: - - intent: greet # intent of the user message - - action: utter_greet # reaction of the bot - -- story: thank - steps: - - intent: thank - - action: utter_noworries - -- story: goodbye - steps: - - intent: bye - - action: utter_bye +# ... your other policies ``` -We'll also need to add the intents, actions and responses to our domain: - -```yml title="domain.yml" -intents: - - greet - - bye - - thank - -responses: - utter_noworries: - - text: No worries! - - utter_greet: - - text: Hi +::: - utter_bye: - - text: Bye! -``` +### 1. Creating intents -Finally, we'll copy over some user message training data from Sara to train our -intents (more can be found [here](https://github.com/RasaHQ/rasa-demo/blob/master/data/nlu/nlu.md)): +The first step is to define the messages you want the bot to handle. You can copy +over some user message training data from Sara to train your +intents. Sara is the Rasa assistant that helps users to get started with our +Rasa products, you can find more training data to use in your projects +[here](https://github.com/RasaHQ/rasa-demo/blob/master/data/nlu/nlu.md). +Add the training data to your bot, you'll need to add it to the training data files: ```yaml-rasa title="data/nlu.yml" nlu: @@ -145,6 +107,58 @@ nlu: - cheers ``` +### 2. Writing stories + +Now that you've defined your intents, you'll need to add some [stories](stories.mdx) +for the `goodbye`, `thank` and `greet` intents. +You can add the following stories to define how the bot will respond to the intents: + +```yaml-rasa title="data/stories.yml" +stories: + +- story: greet # name of the story + steps: + - intent: greet # intent of the user message + - action: utter_greet # reaction of the bot + +- story: thank + steps: + - intent: thank + - action: utter_noworries + +- story: goodbye + steps: + - intent: bye + - action: utter_bye +``` + +:::note Lots of intents +Do you have lots of intents that you want to respond to? In that case you should +check out [Handling FAQs](./chitchat-faqs.mdx#handling-faqs-using-a-response-selector). +You'll avoid the need to specify one story for each of your intents. +::: + +You'll need to tie all these pieces together by adding the intents, actions +and responses to our [domain](domain.mdx): + +```yaml-rasa title="domain.yml" +intents: +- greet +- bye +- thank + +responses: + utter_noworries: + - text: No worries! + + utter_greet: + - text: Hi + + utter_bye: + - text: Bye! +``` + +### 3. Using the bot You can now train a first model and test the bot, by running the following commands: ```bash @@ -158,14 +172,18 @@ For example: Memoization Policy Conversation -While it's good to test the bot interactively, we should also add end to end test cases that -can later be included as part of a [CI/CD system](./setting-up-ci-cd). End-to-end [test conversations](./testing-your-assistant#end-to-end-testing) -include NLU data, so that both components of Rasa can be tested. -The file `tests/conversation_tests.md` contains example test conversations. Delete all the test conversations and replace -them with some test conversations for your assistant so far: +### 4. 

-```yaml title="tests/conversation_tests.yml"
-e2e_tests:
+While it's good to test the bot interactively, you should also add story tests that
+can later be included as part of a [CI/CD system](./setting-up-ci-cd).
+[Story tests](./testing-your-assistant#end-to-end-testing)
+include user messages, bot actions and responses. This ensures that
+the dialogue handling is tested as well as the message handling.
+The initial project already contains test conversations. You can replace
+them with some test conversations for your assistant:
+
+```yaml-rasa title="tests/test_stories.yml"
+stories:
 - story: greet and goodbye
   steps:
   - user: |
      hey
@@ -204,36 +222,50 @@
    - action: utter_bye
 ```

-To test our model against the test file, run the command:
+This test file contains three separate test stories. You can
+test your bot on all of them using `rasa test`:

 ```bash
-rasa test --stories tests/conversation_tests.yml
+rasa test --stories tests/test_stories.yml
 ```

-The test command will produce a directory named `results`. It should contain a file
+The test command will produce a directory named `results`. It will contain a file
 called `failed_stories.yml`, where any test cases that failed will be printed. It will
 also specify whether it was an NLU or Core prediction that went wrong. As part of a
 CI/CD pipeline, the test option `--fail-on-prediction-errors` can be used to throw
-an exception that stops the pipeline.
+an exception that stops the pipeline:
+
+```bash
+rasa test --stories tests/test_stories.yml --fail-on-prediction-errors
+```
+
+## Handling FAQs using a Response Selector

-
+When you need to handle lots of different messages like FAQs or chitchat, the above
+approach using the `MemoizationPolicy` will become cumbersome. You will need to write
+one story for each of the different intents.

-## Response Selectors
+The [ResponseSelector](components/selectors.mdx#responseselector) is designed to
+make it easier to handle conversation patterns like small talk and FAQ messages.
+When you use the `ResponseSelector`, you only need one story to handle all FAQs,
+instead of adding one story for each intent.

-The [ResponseSelector](components/selectors.mdx#responseselector) NLU component
-is designed to make it easier to handle conversation patterns like small talk and
-FAQ messages in a simple manner. By using the `ResponseSelector`, you only need one
-story to handle all FAQs, instead of adding new stories every time you want to
-increase your bot's scope.
+:::note More details on the Response Selector
+
+This walkthrough shows you how to use the `ResponseSelector`. If you want to know
+more about how it works under the hood, head over to this [blog post](https://blog.rasa.com/response-retrieval-models/) and the
+[Retrieval Actions](./retrieval-actions.mdx) page.
+:::
+
+### 1. Creating intents

 People often ask Sara different questions surrounding the Rasa products, so let's
 start with three intents: `ask_channels`, `ask_languages`, and `ask_rasax`.

-We're going to copy over some user messages from the
-[Sara training data](https://github.com/RasaHQ/rasa-demo/blob/master/data/nlu/nlu.md)
-into our training data. 
It's important that these intents have an `faq/` prefix, so -they're recognized as the faq intent by the `ResponseSelector`: -```yml title="data/nlu.yml" +Similar to the `MemoizationPolicy`, you'll need to define the intents and add some +training data for them: + +```yaml-rasa title="data/nlu.yml" nlu: - intent: faq/ask_channels examples: | @@ -260,11 +292,19 @@ nlu: - Tell me about rasa x - tell me what is rasa x ``` +:::info FAQ/ prefix +It's important that these intents have a common prefix that is separated by a `/`. +E.g. in the above example, all intents share the `faq/` prefix. This is necessary +for the intents to be recognized by the `ResponseSelector`. -Next, we'll need to define the responses associated with these FAQs in a new +::: + +### 2. Creating responses + +Next, you'll need to define the responses associated with these FAQs in a new file: -```yaml title="data/responses.yml" +```yaml-rasa title="data/responses.yml" responses: faq/ask_channels: - text: | @@ -279,9 +319,11 @@ responses: - text: "Rasa X is a tool to learn from real conversations and improve your assistant. Read more [here](https://rasa.com/docs/rasa-x/)" ``` +### 3. Configuring the bot + The `ResponseSelector` should already be at the end of your pipeline configuration: -```yaml title="config.yml" {14-15} +```yaml-rasa title="config.yml" {14-15} language: en pipeline: - name: WhitespaceTokenizer @@ -303,7 +345,7 @@ Now that we've defined the message handling side, we need to make the dialogue handling parts aware of these changes. First, we need to add the new intents to our domain: -```yaml title="domain.yml" +```yaml-rasa title="domain.yml" intents: - greet - bye @@ -316,14 +358,14 @@ which takes care of sending the response predicted from the `ResponseSelector` back to the user, to the list of actions. These actions always have to start with the `respond_` prefix: -```yaml title="domain.yml" +```yaml-rasa title="domain.yml" actions: - respond_faq ``` Next we'll write a story so that the dialogue engine knows which action to predict: -```yml title="data/stories.yml" +```yaml-rasa title="data/stories.yml" stories: - story: Some question from FAQ steps: @@ -333,6 +375,8 @@ stories: This prediction is handled by the `MemoizationPolicy`, as we described earlier. +### 4. Using the bot + After all of the changes are done, train a new model and test the modified FAQs: ```bash @@ -340,10 +384,12 @@ rasa train rasa shell ``` +### 5. Testing the bot + At this stage it makes sense to add a few test cases for our conversations: -```yaml title="tests/conversation_tests.yml" -e2e_tests: +```yaml-rasa title="tests/test_stories.yml" +stories: - story: ask channels steps: - user: | @@ -365,25 +411,20 @@ e2e_tests: intent: faq - action: respond_faq ``` - -You can read more in this [blog post](https://blog.rasa.com/response-retrieval-models/) and the -[Retrieval Actions](./retrieval-actions.mdx) page. +### Response Selector Checklist Using the features we described in this tutorial, you can easily build a context-less assistant. 
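+
+Condensing the snippets above into one place, a working configuration for this
+tutorial could look roughly like the following sketch (the featurizers and intent
+classifier in your pipeline may differ):
+
+```yaml-rasa title="config.yml"
+language: en
+pipeline:
+- name: WhitespaceTokenizer
+# ... featurizers and intent classifier ...
+- name: ResponseSelector
+
+policies:
+- name: MemoizationPolicy
+  max_history: 1
+```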
-:::note Checklist -Here's a minimal checklist of files we modified to build a basic FAQ assistant: +Here's a **minimal checklist of files you need to modify** to build a basic FAQ assistant: -* `data/nlu.yml`: Add NLU training data for `faq/` intents +* `data/nlu.yml`: Add user message training data for `faq/` intents * `data/responses.yml`: Add responses associated with `faq/` intents * `config.yml`: Add `ResponseSelector` in your NLU pipeline -* `domain.yml`: Add a retrieval action `respond_faq` and intent `faq` +* `domain.yml`: Add a retrieval action `respond_faq` and the intent `faq` * `data/stories.yml`: Add a simple story for FAQs -* `tests/conversation_tests.yml`: Add E2E test stories for your FAQs - -::: +* `tests/test_stories.yml`: Add E2E test stories for your FAQs diff --git a/docs/docs/command-line-interface.mdx b/docs/docs/command-line-interface.mdx index afe619ba1fb2..4ece80398364 100644 --- a/docs/docs/command-line-interface.mdx +++ b/docs/docs/command-line-interface.mdx @@ -25,7 +25,7 @@ The command line interface (CLI) gives you easy-to-remember commands for common |`rasa x` |Launch Rasa X locally. | |`rasa -h` |Shows all available commands. | -## Create a new project +## rasa init A single command sets up a complete project for you with some example training data. @@ -42,14 +42,14 @@ This creates the following files: ├── config.yml ├── credentials.yml ├── data -│   ├── nlu.md -│   └── stories.md +│ ├── nlu.yml +│ └── stories.yml ├── domain.yml ├── endpoints.yml ├── models │ └── .tar.gz └── tests - └── conversation_tests.md + └── test_stories.yml ``` The `rasa init` command will ask you if you want to train an initial model using this data. @@ -59,7 +59,7 @@ With this project setup, common commands are very easy to remember. To train a model, type `rasa train`, to talk to your model on the command line, `rasa shell`, to test your model type `rasa test`. -## Train a Model +## rasa train The main command is: @@ -87,7 +87,7 @@ If training data for only one model type is present, the command automatically f ::: -## Interactive Learning +## rasa interactive To start an interactive learning session with your assistant, run @@ -114,9 +114,10 @@ The full list of arguments that can be set for `rasa interactive` is: ```text [rasa interactive --help] ``` -## Talk to your Assistant +## rasa shell -To start a chat session with your assistant on the command line, run: +This command allows you to talk to your assistant. To start a chat session +with your assistant on the command line, run: ```bash rasa shell @@ -149,7 +150,7 @@ The full list of options for `rasa shell` is: ```text [rasa shell --help] ``` -## Start a Server +## rasa run To start a server running your Rasa model, run: @@ -175,7 +176,7 @@ The following arguments can be used to configure your Rasa server: For more information on the additional parameters, see [Model Storage](./model-storage.mdx). See the Rasa [HTTP API](./http-api.mdx) docs for detailed documentation of all the endpoints. 
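+
+If your assistant uses custom actions, the Rasa server also needs to know where to
+reach your action server (see the next section). As a minimal sketch, assuming the
+action server runs locally on its default port `5055`, the corresponding entry in
+`endpoints.yml` looks like this:
+
+```yaml
+action_endpoint:
+  url: "http://localhost:5055/webhook"
+```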
-## Start an Action Server
+## rasa run actions

 To run your action server run

 ```bash
 rasa run actions
 ```

 The following arguments can be used to adapt the server settings:

 ```text [rasa run actions --help]
 ```

-## Visualize your Stories
+## rasa visualize

 To open a browser tab with a graph showing your stories:

 ```bash
 rasa visualize
 ```

 Additional arguments are:

 ```text [rasa visualize --help]
 ```

-## Evaluating a Model on Test Data
+## rasa test

 To evaluate your model on test data, run:

 ```bash
 rasa test
 ```

-Specify the model to test using `--model`.
-Check out more details in [Evaluating an NLU Model](./testing-your-assistant.mdx#evaluating-an-nlu-model) and [Evaluating a Core Model](./testing-your-assistant.mdx#evaluating-a-core-model).
+Specify the model to test using `--model`. The above command will test your model
+end-to-end using test stories. You can evaluate the dialogue and NLU
+models separately using:
+
+```bash
+rasa test core
+```
+
+and
+
+```bash
+rasa test nlu
+```
+
+Check out more details in
+[Evaluating an NLU Model](./testing-your-assistant.mdx#evaluating-an-nlu-model) and
+[Evaluating a Core Model](./testing-your-assistant.mdx#evaluating-a-core-model).

 The following arguments are available for `rasa test`:

 ```text [rasa test --help]
 ```

-## Create a Train-Test Split
+## rasa data split

-To create a split of your NLU data, run:
+This command allows you to create a train-test split of your training data. To
+create a split of your NLU data, run:

 ```bash
 rasa data split nlu
 ```

-You can specify the training data, the fraction, and the output directory using the following arguments:
+You can specify the training data, the fraction, and the output directory using
+the following arguments:

 ```text [rasa data split nlu --help]
 ```
@@ -239,14 +255,25 @@
 If you have NLG data for retrieval actions, this will be saved to separate files:

 ```bash
 ls train_test_split

-      nlg_test_data.md test_data.json
-      nlg_training_data.md training_data.json
+      nlg_test_data.yml test_data.json
+      nlg_training_data.yml training_data.json
 ```

-## Convert Data Between Markdown and JSON
+## rasa data convert
+
+You can convert NLU data from
+- LUIS data format,
+- WIT data format,
+- Dialogflow data format,
+- JSON, or
+- Markdown
+
+to
+- YAML,
+- JSON, or
+- Markdown.

-To convert NLU data from LUIS data format, WIT data format, Dialogflow data format, JSON, or Markdown
-to JSON or Markdown, run:
+You can start the converter by running:

 ```bash
 rasa data convert nlu
 ```

 You can specify the input file, output file, and the output format with the following arguments:

 ```text [rasa data convert nlu --help]
 ```

-## Export Conversations to an Event Broker
+## rasa export

 To export events from a tracker store using an event broker, run:

@@ -272,7 +299,7 @@
 should be published.

 ```text [rasa export --help]
 ```

-## Start Rasa X
+## rasa x

 Rasa X is a toolset that helps you leverage conversations to improve your assistant. You can find more information about it here. You can start Rasa X locally by executing

 ```bash
 rasa x
 ```

 To be able to start Rasa X you need to have Rasa X local mode installed
-and you need to be in a Rasa project.:::note
+and you need to be in a Rasa project.
+
+:::note
 By default Rasa X runs on the port 5002. Using the argument `--rasa-x-port` allows you
 to change it to any other port.
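+
+For example, to start Rasa X on port 5003 (an arbitrary example port), run:
+
+```bash
+rasa x --rasa-x-port 5003
+```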
diff --git a/docs/docs/fallback-handoff.mdx b/docs/docs/fallback-handoff.mdx index bbc03e94495f..3bfa2509e647 100644 --- a/docs/docs/fallback-handoff.mdx +++ b/docs/docs/fallback-handoff.mdx @@ -285,11 +285,11 @@ let's say the user asks “I want to apply for a job at Rasa”, we can then rep :::note Here's a minimal checklist of files we modified to help our assistant fail gracefully: -* `data/nlu.md`: +* `data/nlu.yml`: * Add training data for the `out_of_scope` intent & any specific out of scope intents that you want to handle seperately -* `data/stories.md`: +* `data/stories.yml`: * Add stories for any specific out of scope intents diff --git a/docs/docs/jupyter-notebooks.mdx b/docs/docs/jupyter-notebooks.mdx index 5b0b86e68861..2b5d20c09a5f 100644 --- a/docs/docs/jupyter-notebooks.mdx +++ b/docs/docs/jupyter-notebooks.mdx @@ -82,7 +82,8 @@ The return values are the paths to these newly created directories. ```bash import rasa.data as data -stories_directory, nlu_data_directory = data.get_core_nlu_directories(training_files) +nlu_data_directory = data.get_data_directories(training_files, data.is_nlu_file) +stories_directory = data.get_data_directories(training_files, data.is_story_file) print(stories_directory, nlu_data_directory) ``` To test your model, call the `test` function, passing in the path @@ -108,5 +109,5 @@ else: if os.path.isdir("results"): print("\n") print("Core Errors:") - print(open("results/failed_stories.md").read()) + print(open("results/failed_test_stories.yml").read()) ``` diff --git a/docs/docs/policies.mdx b/docs/docs/policies.mdx index 53b9bb53f1c3..9787693f8c3b 100644 --- a/docs/docs/policies.mdx +++ b/docs/docs/policies.mdx @@ -384,10 +384,25 @@ a fixed behavior. Please see [Rules](./rules.mdx) for further information. ## Memoization Policy -The `MemoizationPolicy` just memorizes the conversations in your -training data. It predicts the next action with confidence `1.0` -if this exact conversation exists in the training data, otherwise it -predicts `None` with confidence `0.0`. +The `MemoizationPolicy` remembers the stories from your +training data. It checks if the current conversation matches a story +in the training data. If so, it will predict the next action from the matching +story of your training data with a confidence of `1.0`. If no matching conversation +is found, the policy predicts `None` with confidence `0.0`. + +When looking for a match in your training data, the policy will take the last +`max_history` number of turns of the conversation into account. +One “turn” includes the message sent by the user and any actions the +assistant performed before waiting for the next message. + +You can configure the number of turns the `MemoizationPolicy` should use in your +configuration: +```yaml title="config.yml" +policies: + - name: "MemoizationPolicy" + max_history: 3 +``` + ## Augmented Memoization Policy diff --git a/docs/docs/setting-up-ci-cd.mdx b/docs/docs/setting-up-ci-cd.mdx index 960fbec395ce..4dbb1f8e9294 100644 --- a/docs/docs/setting-up-ci-cd.mdx +++ b/docs/docs/setting-up-ci-cd.mdx @@ -93,7 +93,7 @@ important as you start introducing more complicated stories from user conversations. ```bash -rasa test --stories tests/conversation_tests.md --fail-on-prediction-errors +rasa test --stories tests/test_stories.yml --fail-on-prediction-errors ``` The `--fail-on-prediction-errors` flag ensures the test will fail if any test @@ -106,7 +106,7 @@ to be representative of the true distribution of real conversations. 
Rasa X makes it easy to [add test conversations based on real conversations](https://rasa.com/docs/rasa-x/user-guide/test-assistant/#how-to-create-tests). Note: End-to-end testing does **not** execute your action code. You will need to -[test your action code](./setting-up-ci-cd.mdx#testing-action-code) in a seperate step. +[test your action code](./setting-up-ci-cd.mdx#testing-action-code) in a separate step. ### Compare NLU Performance diff --git a/docs/docs/testing-your-assistant.mdx b/docs/docs/testing-your-assistant.mdx index 2e581c34bdb3..3ba4df183f73 100644 --- a/docs/docs/testing-your-assistant.mdx +++ b/docs/docs/testing-your-assistant.mdx @@ -7,31 +7,35 @@ description: Test your Rasa Open Source assistant to validate and improve your c import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; +Rasa Open Source lets you test dialogues end-to-end by running through +test stories. The test makes sure that user messages are processed correctly +and the dialogue predictions are correct. In addition to end-to-end tests, you can +also test the dialogue handling (core) and the message processing (nlu) +separately. + :::note If you are looking to tune the hyperparameters of your NLU model, check out this [tutorial](https://blog.rasa.com/rasa-nlu-in-depth-part-3-hyperparameters/). ::: - ## End-to-End Testing -Rasa Open Source lets you test dialogues end-to-end by running through -test conversations and making sure that both NLU and Core make correct predictions. +Testing your assistant requires you to write test stories, which include +the user messages and the conversation history. The format is the same as the one +used to specify stories in your training data. -To do this, you need some stories in the end-to-end format, -which includes both the NLU output and the original text. 
Here are some examples: - ```yaml-rasa - e2e_tests: + ```yaml-rasa title="tests/test_stories.yml" + stories: - story: A basic end-to-end test steps: - user: | - hey + hello intent: greet - action: utter_ask_howcanhelp - user: | @@ -39,41 +43,41 @@ Here are some examples: intent: inform - action: utter_ask_location - user: | - in [Paris]{"entity": "location"} + in [Paris]{"entity": "location"} intent: inform - action: utter_ask_price ``` - - ```yaml-rasa - e2e_tests: - - story: A test where a custom action returns events + + ```yaml-rasa title="tests/test_stories.yml" + stories: + - story: A test where a custom action returns events steps: - user: | - hey + hey intent: greet - action: my_custom_action - slot_was_set: - - my_slot: "value added by custom action" - - action: - - utter_ask_age + - my_slot: "value added by custom action" + - action: utter_ask_age - user: | - thanks + thanks intent: thankyou - action: utter_no_worries ``` + - - ```yaml-rasa - e2e_tests: + + ```yaml-rasa title="tests/test_stories.yml" + stories: - story: A test conversation with a form steps: - user: | - hi + hi intent: greet - action: utter_greet - user: | @@ -82,41 +86,41 @@ Here are some examples: - action: restaurant_form - active_loop: restaurant_form - user: | - [afghan](cuisine) food + [afghan](cuisine) food intent: inform - action: restaurant_form - active_loop: null - action: utter_slots_values - user: | - thanks + thanks intent: thankyou - action: utter_no_worries ``` - - ```yaml-rasa - e2e_tests: + + ```yaml-rasa title="tests/test_stories.yml" + stories: - story: A test conversation with unexpected input during a form steps: - user: | - hi + hi intent: greet - action: utter_greet - user: | - im looking for a restaurant + im looking for a restaurant intent: request_restaurant - action: restaurant_form - active_loop: restaurant_form - user: | - can you share your boss with me? + can you share your boss with me? - action: utter_chitchat - action: restaurant_form - active_loop: null - action: utter_slots_values - user: | - thanks + thanks intent: thankyou - action: utter_no_worries ``` @@ -124,34 +128,30 @@ Here are some examples: -By default Rasa Open Source saves conversation tests to `tests/conversation_tests.md`. +By default Rasa Open Source saves conversation tests to `tests/test_stories.yml`. You can test your assistant against them by running: ```bash rasa test ``` -:::note -[Custom Actions](./actions.mdx#custom-actions) are **not executed as part of end-to-end tests.** If your custom -actions append any events to the tracker, this has to be reflected in your end-to-end -tests (e.g. by adding `slot` events to your end-to-end story). +The command will always load all stories from any story files, where the file +name starts with `test_`, e.g. `test_stories.yml`. Your story test +file names should always start with `test_` for this detection to work. -To test your custom actions, write unit tests for them and include these -tests in your CI/CD pipeline. +:::info Custom Actions +[Custom Actions](./custom-actions.mdx) are **not executed as part of end-to-end tests.** If your custom +actions append any events to the conversation, this has to be reflected in your end-to-end +tests (e.g. by adding `slot_was_set` events to your end-to-end story). + +If you want to test the code of your custom actions, you should write unit tests +for them and include these tests in your CI/CD pipeline. 
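+
+For example, suppose a custom action sets a slot when the user asks about the
+weather (the names `ask_weather`, `action_check_weather`, `forecast` and
+`utter_forecast` here are all hypothetical). The test story then has to contain
+the matching `slot_was_set` step, as in this sketch:
+
+```yaml-rasa title="tests/test_stories.yml"
+stories:
+- story: A test reflecting a slot set by a custom action
+  steps:
+  - user: |
+      what will the weather be like?
+    intent: ask_weather
+  - action: action_check_weather   # custom action; not executed, only predicted
+  - slot_was_set:
+    - forecast: sunny              # event the custom action would have returned
+  - action: utter_forecast
+```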
::: If you have any questions or problems, please share them with us in the dedicated [testing section on our forum](https://forum.rasa.com/tags/testing) ! -:::note -Make sure your model file in `models` is a combined `core` -and `nlu` model. If it does not contain an NLU model, Core will use -the default `RegexInterpreter`. - -::: - - ## Evaluating an NLU Model A standard technique in machine learning is to keep some data separate as a *test set*. @@ -162,10 +162,11 @@ into train and test sets using: rasa data split nlu ``` -If you've done this, you can see how well your NLU model predicts the test cases using this command: +If you've done this, you can see how well your NLU model predicts the +test cases: ```bash -rasa test nlu -u train_test_split/test_data.md --model models/nlu-20180323-145833.tar.gz +rasa test nlu --nlu train_test_split/test_data.yml ``` If you don't want to create a separate test set, you can @@ -173,13 +174,11 @@ still estimate how well your model generalises using cross-validation. To do this, add the flag `--cross-validation`: ```bash -rasa test nlu -u data/nlu.md --config config.yml --cross-validation +rasa test nlu --nlu data/nlu.yml --cross-validation ``` -The full list of options for the script is: - -```text [rasa test nlu --help] -``` +You can find the full list of options in the +[CLI documentation on rasa test](command-line-interface.mdx#rasa-test) ### Comparing NLU Pipelines @@ -188,7 +187,7 @@ a comparative examination between the pipelines. ```bash rasa test nlu --config pretrained_embeddings_spacy.yml supervised_embeddings.yml - --nlu data/nlu.md --runs 3 --percentages 0 25 50 70 90 + --nlu data/nlu.yml --runs 3 --percentages 0 25 50 70 90 ``` The command in the example above will create a train/test split from your data, @@ -278,10 +277,10 @@ You can evaluate your trained model on a set of test stories by using the evaluate script: ```bash -rasa test core --stories test_stories.md --out results +rasa test core --stories test_stories.yml --out results ``` -This will print the failed stories to `results/failed_stories.md`. +This will print the failed stories to `results/failed_test_stories.yml`. We count any story as failed if at least one of the actions was predicted incorrectly. diff --git a/docs/docs/training-data-format.mdx b/docs/docs/training-data-format.mdx index 74476a4484a9..0c66a2e0133b 100644 --- a/docs/docs/training-data-format.mdx +++ b/docs/docs/training-data-format.mdx @@ -57,6 +57,18 @@ Here's a short example which keeps all training data in a single file: ```yaml-rasa version: "2.0" +nlu: +- intent: greet + examples: | + - Hey + - Hi + - hey there [Sara](name) + +- intent: faq/language + examples: | + - What language do you speak? + - Do you only handle english? + stories: - story: greet and faq steps: @@ -71,21 +83,13 @@ rules: - intent: greet - action: utter_greet -nlu: -- intent: greet - examples: | - - Hey - - Hi - - hey there [Sara](name) - -- intent: faq/language - examples: | - - What language do you speak? - - Do you only handle english? +``` -e2e_tests: +If you want to specify your test stories, you need to put them into a separate file: +```yaml-rasa title="tests/test_stories.yml" +stories: - story: greet and ask language -- steps: +- steps: - user: | hey intent: greet @@ -95,7 +99,7 @@ e2e_tests: intent: faq/language - action: respond_faq ``` - +They use the same format as the story training data. ## NLU Training Data @@ -103,7 +107,7 @@ e2e_tests: **intent**, i.e. 
what the user is trying to convey or accomplish with their message. Training examples can also include **entities**. Entities are structured pieces of information that can be extracted from a user's message. You can also -add extra information such as regular expressions and lookup tables to your +add extra information such as regular expressions and lookup tables to your training data to help the model identify intents and entities correctly. NLU training data is defined under the `nlu` key. Items that can be added under this key are: @@ -122,7 +126,7 @@ nlu: - [Synonyms](#synonyms) ```yaml-rasa -nlu: +nlu: - synonym: credit examples: | - credit card account @@ -169,7 +173,7 @@ However, it's also possible to use an extended format if you have a custom NLU c ```yaml-rasa nlu: - intent: greet - examples: + examples: - text: | hi metadata: @@ -178,14 +182,14 @@ nlu: hey there! ``` -The `metadata` key can contain arbitrary key-value data that stays with an example and is accessible by the components in the NLU pipeline. In the example above, the sentiment of +The `metadata` key can contain arbitrary key-value data that stays with an example and is accessible by the components in the NLU pipeline. In the example above, the sentiment of the example could be used by a custom component in the pipeline for sentiment analysis. ### Entities [Entities](glossary.mdx#entity) are structured pieces of information that can be extracted from a user's message. For entity extraction to work, you need to either specify training data to train an ML model or you need to define [regular expressions](#regular-expressions-for-entity-extraction) to extract entities using the [`RegexEntityExtractor`](components/entity-extractors.mdx#regexentityextractor) based on a character pattern. -Entities are annotated in training examples with minimally the entity's name. +Entities are annotated in training examples with minimally the entity's name. In addition to the entity name, you can annotate an entity with synonyms, roles, or groups. In training examples, entity annotation would look like this: @@ -200,7 +204,7 @@ nlu: ``` -The full possible syntax for annotating an entity is: +The full possible syntax for annotating an entity is: ```text []{"entity": "", "role": "", "group": "", "value": ""} @@ -243,7 +247,7 @@ pipeline in your configuration file contains the should define synonyms when there are multiple ways users refer to the same thing. -#### Example +#### Example Let's say you had an entity `account`, and you expect the value "credit". Your users also refer to their "credit" account as "credit @@ -264,16 +268,16 @@ recognize these as entities and replace them with `credit`. ### Regular Expressions -You can use regular expressions to improve intent classification and +You can use regular expressions to improve intent classification and entity extraction in combination with the [`RegexFeaturizer`](components/featurizers.mdx#regexfeaturizer) and [`RegexEntityExtractor`](components/entity-extractors.mdx#regexentityextractor) components in the pipeline. -#### Regular Expressions for Intent Classification +#### Regular Expressions for Intent Classification -You can use regular expressions to improve intent classification by including the `RegexFeaturizer` component in your pipeline. When using the `RegexFeaturizer`, a regex does not act as a rule for classifying an intent. 
It only provides a feature that the intent classifier will use +You can use regular expressions to improve intent classification by including the `RegexFeaturizer` component in your pipeline. When using the `RegexFeaturizer`, a regex does not act as a rule for classifying an intent. It only provides a feature that the intent classifier will use to learn patterns for intent classification. -Currently, all intent classifiers make use of available regex features. +Currently, all intent classifiers make use of available regex features. -The name of a regex in this case is a human readable description. It can help you remember what a regex is used for, and it is the title of the corresponding pattern feature. It does not have to match any intent or entity name. A regex for greeting might look like this: +The name of a regex in this case is a human readable description. It can help you remember what a regex is used for, and it is the title of the corresponding pattern feature. It does not have to match any intent or entity name. A regex for greeting might look like this: ```yaml-rasa nlu: @@ -303,7 +307,7 @@ If your entity has a deterministic structure (like an account number), you can u `DIETClassifier` component. Other entity extractors, like `MitieEntityExtractor` or `SpacyEntityExtractor`, won't use the generated features and their presence will not improve entity recognition for - these extractors. + these extractors. 2. For rule-based entity extraction using the [`RegexEntityExtractor`](components/entity-extractors.mdx#regexentityextractor) component in the pipeline. @@ -326,7 +330,7 @@ nlu: When using the `RegexFeaturizer`, a regular expression only provides a feature that helps the model learn an association between intents/entities and inputs that fit the regular expression. In order for the model to learn this association, -you must provide example inputs that include that regular expression! +you must provide example inputs that include that regular expression! ::: @@ -379,16 +383,16 @@ model. [**Stories**](stories.mdx) are used to train a machine learning model to identify patterns in conversations and generalize to unseen conversation paths. **[Rules](rules.mdx)** describe parts of conversations that should always follow the same path and are used to train the -[RulePolicy](policies.mdx#rule-policy). +[RulePolicy](policies.mdx#rule-policy). ### Stories Stories are composed of: - - - `story`: The story's name. The name is arbitrary and not used in training; + + - `story`: The story's name. The name is arbitrary and not used in training; you can use it as a human-readable reference for the story. - - `metadata`: arbitrary and optional, not used in training, + - `metadata`: arbitrary and optional, not used in training, you can use it to store relevant information about the story like e.g. the author - a list of `steps`: The user messages and actions that make up the story @@ -431,7 +435,7 @@ messages the users can send to mean the same thing. User messages follow the format: ```yaml-rasa -stories: +stories: - story: user message structure steps: - intent: intent_name # Required @@ -505,8 +509,8 @@ A [form](glossary.mdx#form) is a specific kind of custom action that contains th a set of required slots and ask the user for this information. You [define a form](forms.mdx#defining-a-form) in the `forms` section in your domain. 
Once defined, the [happy path](glossary.mdx#happy-unhappy-paths) -for a form should be specified as a [rule](forms.mdx), but interruptions of forms or -other "unhappy paths" should be included in stories so that the model can +for a form should be specified as a [rule](forms.mdx), but interruptions of forms or +other "unhappy paths" should be included in stories so that the model can generalize to unseen conversation sequences. As a step in a story, a form takes the following basic format: @@ -527,10 +531,10 @@ The `action` step activates the form and begins looping over the required slots. step indicates that there is a currently active form. Much like a `slot_was_set` step, a `form` step doesn't **set** a form to active but indicates that it should already be activated. In the same way, the `active_loop: null` step indicates that no form should be active before the subsequent -steps are taken. +steps are taken. -A form can be interrupted and remain active; in this case the interruption should come after the -`action:
` step and be followed by the `active_loop: ` step.
+A form can be interrupted and remain active; in this case the interruption should come after the
+`action: ` step and be followed by the `active_loop: ` step.
 An interruption of a form could look like this:

 ```yaml-rasa
 stories:
@@ -549,10 +553,10 @@ stories:

 #### Slots

-A slot event is specified under the key `slot_was_set:` with the 
+A slot event is specified under the key `slot_was_set:` with the
 slot name and optionally the slot's value.

-**[Slots](domain.mdx#slots)** act as the bots memory. 
+**[Slots](domain.mdx#slots)** act as the bot's memory.
 Slots are **set** by entities or by custom actions and **referenced** by stories
 in `slot_was_set` steps. For example:

@@ -594,7 +598,7 @@
 action **before** the `slot_was_set` step.

 #### Checkpoints

 Checkpoints are specified with the `checkpoint:` key, either at the beginning
-or the end of a story. 
+or the end of a story.

 Checkpoints are ways to connect stories together. They can be either the first

@@ -627,7 +631,7 @@ steps:
   - checkpoint: greet_checkpoint
     # This checkpoint should only apply if slots are set to the specified value
-    slots: 
+    slots:
     - context_scenario: holiday
     - holiday_name: thanksgiving
   - intent: greet
@@ -639,7 +643,7 @@
 Checkpoints can help simplify your training data and reduce redundancy in it,
 but **do not overuse them**. Using lots of checkpoints can quickly make your
 stories hard to understand. It makes sense to use them if a sequence of steps
 is repeated often in different stories, but stories without checkpoints
-are easier to read and write. 
+are easier to read and write.

 #### OR statement

@@ -693,22 +697,16 @@
 Read more about writing rules [here](rules.mdx#writing-a-rule).

 ## Test Conversations

-Test conversations combine both NLU and Core training data into a end-to-end story
-for evaluation.
-
-:::info Test Only
-This format is only used for end-to-end evaluation and cannot be used for training.
-:::
-
-Test conversations are listed under the `e2e_tests` key.
-Their format is similar to the [story](#stories) format,
-except that user message steps can include a `user` key which specifies the actual
-text and entity annotation of the user message.
+Test conversations use stories to test your bot. The tests check if a message is
+classified correctly as well as the action predictions.

-Here's an example of a test conversation:
+Test stories use the same format as [stories](#stories),
+except that user message steps can include a `user` key to specify the actual
+text and entity annotations of the user message. Here's an example of a
+test conversation:

 ```yaml-rasa
-e2e_tests:
+stories:
 - story: A basic end-to-end test
   steps:
   - user: |
      hey
    intent: greet
  - action: utter_ask_howcanhelp
  - user: |
@@ -724,3 +722,11 @@
    intent: inform
  - action: utter_ask_price
 ```
+
+You can run the tests using the CLI:
+
+```bash
+rasa test
+```
+
+If you want to know more about testing, head over to
+[Testing Your Assistant](testing-your-assistant.mdx).
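+
+As shown earlier for the FAQ assistant, you can also point `rasa test` at a
+specific file of test stories:
+
+```bash
+rasa test --stories tests/test_stories.yml
+```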
diff --git a/docs/docs/unexpected-input.mdx b/docs/docs/unexpected-input.mdx index 743d4c2ce05d..31ceb9b8394c 100644 --- a/docs/docs/unexpected-input.mdx +++ b/docs/docs/unexpected-input.mdx @@ -220,7 +220,7 @@ Here's a minimal checklist of of files we modified to handle unexpected user in * `actions.py`: Define `action_greet` -* `data/nlu.md`: Add training data for an `explain` intent +* `data/nlu.yml`: Add training data for an `explain` intent * `domain.yml`: @@ -232,7 +232,7 @@ Here's a minimal checklist of of files we modified to handle unexpected user in * Add responses for contextual question interruptions -* `data/stories.md`: +* `data/stories.yml`: * Remove stories using mapped intents if you have them diff --git a/examples/formbot/README.md b/examples/formbot/README.md index c95a7e940059..b0fc8e41bce2 100644 --- a/examples/formbot/README.md +++ b/examples/formbot/README.md @@ -10,8 +10,8 @@ restaurants based on user preferences. This example contains some training data and the main files needed to build an assistant on your local machine. The `formbot` consists of the following files: -- **data/nlu.md** contains training examples for the NLU model -- **data/stories.md** contains training stories for the Core model +- **data/nlu.yml** contains training examples for the NLU model +- **data/stories.yml** contains training stories for the Core model - **actions.py** contains the implementation of a custom `FormAction` - **config.yml** contains the model configuration - **domain.yml** contains the domain of the assistant diff --git a/examples/formbot/tests/end-to-end-stories.md b/examples/formbot/tests/end-to-end-stories.md deleted file mode 100644 index 1b9c4700514d..000000000000 --- a/examples/formbot/tests/end-to-end-stories.md +++ /dev/null @@ -1,37 +0,0 @@ -## Happy path -* greet: hi - - utter_greet -* request_restaurant: im looking for a restaurant - - restaurant_form - - form{"name": "restaurant_form"} - - form{"name": null} - - utter_slots_values -* thankyou: thanks - - utter_noworries - -## Happy path with message providing requested value -* greet: hi - - utter_greet -* request_restaurant: im looking for a restaurant - - restaurant_form - - form{"name": "restaurant_form"} -* inform: [afghan](cuisine) food - - restaurant_form - - form{"name": null} - - utter_slots_values -* thankyou: thanks - - utter_noworries - -## unhappy path -* greet: hi - - utter_greet -* request_restaurant: im looking for a restaurant - - restaurant_form - - form{"name": "restaurant_form"} -* chitchat: can you share your boss with me? 
- - utter_chitchat - - restaurant_form - - form{"name": null} - - utter_slots_values -* thankyou: thanks - - utter_noworries \ No newline at end of file diff --git a/examples/formbot/tests/test_stories.yml b/examples/formbot/tests/test_stories.yml new file mode 100644 index 000000000000..10d26db64d27 --- /dev/null +++ b/examples/formbot/tests/test_stories.yml @@ -0,0 +1,63 @@ +stories: +- story: Happy path + steps: + - user: | + hi + intent: greet + - action: utter_greet + - user: | + im looking for a restaurant + intent: request_restaurant + - action: restaurant_form + - active_loop: restaurant_form + - active_loop: null + - action: utter_slots_values + - user: | + thanks + intent: thankyou + - action: utter_noworries + +- story: Happy path with message providing requested value + steps: + - user: | + hi + intent: greet + - action: utter_greet + - user: | + im looking for a restaurant + intent: request_restaurant + - action: restaurant_form + - active_loop: restaurant_form + - user: | + [afghan](cuisine) food + intent: inform + - action: restaurant_form + - active_loop: null + - action: utter_slots_values + - user: | + thanks + intent: thankyou + - action: utter_noworries + +- story: unhappy path + steps: + - user: | + hi + intent: greet + - action: utter_greet + - user: | + im looking for a restaurant + intent: request_restaurant + - action: restaurant_form + - active_loop: restaurant_form + - user: | + can you share your boss with me? + intent: chitchat + - action: utter_chitchat + - action: restaurant_form + - active_loop: null + - action: utter_slots_values + - user: | + thanks + intent: thankyou + - action: utter_noworries diff --git a/examples/knowledgebasebot/README.md b/examples/knowledgebasebot/README.md index 83dcfb06fb82..2a0e271718b1 100644 --- a/examples/knowledgebasebot/README.md +++ b/examples/knowledgebasebot/README.md @@ -7,8 +7,8 @@ This example bot uses a knowledge base to answer user's requests. This example contains some training data and the main files needed to build an assistant on your local machine. The `knowledgebasebot` consists of the following files: -- **data/nlu.md** contains training examples for the NLU model -- **data/stories.md** contains training stories for the Core model +- **data/nlu.yml** contains training examples for the NLU model +- **data/stories.yml** contains training stories for the Core model - **actions.py** contains the custom action for querying the knowledge base - **config.yml** contains the model configuration - **domain.yml** contains the domain of the assistant diff --git a/examples/moodbot/README.md b/examples/moodbot/README.md index 426358e66cd3..61578fb6369a 100644 --- a/examples/moodbot/README.md +++ b/examples/moodbot/README.md @@ -7,8 +7,8 @@ The `moodbot` example simulates how you can use your bot on different channels. This example contains some training data and the main files needed to build an assistant on your local machine. 
The `moodbot` consists of the following files: -- **data/nlu.md** contains training examples for the NLU model -- **data/stories.md** contains training stories for the Core model +- **data/nlu.yml** contains training examples for the NLU model +- **data/stories.yml** contains training stories for the Core model - **config.yml** contains the model configuration - **domain.yml** contains the domain of the assistant - **credentials.yml** contains credentials for the different channels diff --git a/rasa/cli/data.py b/rasa/cli/data.py index 9153a25e37db..e05fc721beb5 100644 --- a/rasa/cli/data.py +++ b/rasa/cli/data.py @@ -9,7 +9,6 @@ from rasa.cli.arguments import data as arguments import rasa.cli.utils from rasa.constants import DEFAULT_DATA_PATH -from rasa.core.interpreter import RegexInterpreter from rasa.core.training.story_reader.markdown_story_reader import MarkdownStoryReader from rasa.core.training.story_writer.yaml_story_writer import YAMLStoryWriter from rasa.nlu.convert import convert_training_data @@ -51,8 +50,6 @@ def add_subparser( def _add_data_convert_parsers( data_subparsers, parents: List[argparse.ArgumentParser] ) -> None: - from rasa.nlu import convert - convert_parser = data_subparsers.add_parser( "convert", formatter_class=argparse.ArgumentDefaultsHelpFormatter, @@ -142,6 +139,11 @@ def _append_story_structure_arguments(parser: argparse.ArgumentParser) -> None: def split_nlu_data(args: argparse.Namespace) -> None: + """Load data from a file path and split the NLU data into test and train examples. + + Args: + args: Commandline arguments + """ from rasa.nlu.training_data.loading import load_data from rasa.nlu.training_data.util import get_file_format @@ -158,8 +160,7 @@ def split_nlu_data(args: argparse.Namespace) -> None: def validate_files(args: argparse.Namespace, stories_only: bool = False) -> None: - """ - Validates either the story structure or the entire project. + """Validates either the story structure or the entire project. Args: args: Commandline arguments @@ -186,6 +187,11 @@ def validate_files(args: argparse.Namespace, stories_only: bool = False) -> None def validate_stories(args: argparse.Namespace) -> None: + """Validates that training data file content conforms to training data spec. + + Args: + args: Commandline arguments + """ validate_files(args, stories_only=True) @@ -281,7 +287,7 @@ def _write_nlu_yaml( def _write_core_yaml( training_data_path: Path, output_path: Path, source_path: Path ) -> None: - reader = MarkdownStoryReader(RegexInterpreter()) + reader = MarkdownStoryReader() writer = YAMLStoryWriter() loop = asyncio.get_event_loop() diff --git a/rasa/cli/initial_project/tests/conversation_tests.md b/rasa/cli/initial_project/tests/conversation_tests.md deleted file mode 100644 index d7bcbfcbfe4e..000000000000 --- a/rasa/cli/initial_project/tests/conversation_tests.md +++ /dev/null @@ -1,51 +0,0 @@ -#### This file contains tests to evaluate that your bot behaves as expected. -#### If you want to learn more, please see the docs: https://rasa.com/docs/rasa/user-guide/testing-your-assistant/ - -## happy path 1 -* greet: hello there! - - utter_greet -* mood_great: amazing - - utter_happy - -## happy path 2 -* greet: hello there! - - utter_greet -* mood_great: amazing - - utter_happy -* goodbye: bye-bye! 
- - utter_goodbye - -## sad path 1 -* greet: hello - - utter_greet -* mood_unhappy: not good - - utter_cheer_up - - utter_did_that_help -* affirm: yes - - utter_happy - -## sad path 2 -* greet: hello - - utter_greet -* mood_unhappy: not good - - utter_cheer_up - - utter_did_that_help -* deny: not really - - utter_goodbye - -## sad path 3 -* greet: hi - - utter_greet -* mood_unhappy: very terrible - - utter_cheer_up - - utter_did_that_help -* deny: no - - utter_goodbye - -## say goodbye -* goodbye: bye-bye! - - utter_goodbye - -## bot challenge -* bot_challenge: are you a bot? - - utter_iamabot diff --git a/rasa/cli/initial_project/tests/test_stories.yml b/rasa/cli/initial_project/tests/test_stories.yml new file mode 100644 index 000000000000..d4e567ab9b16 --- /dev/null +++ b/rasa/cli/initial_project/tests/test_stories.yml @@ -0,0 +1,91 @@ +#### This file contains tests to evaluate that your bot behaves as expected. +#### If you want to learn more, please see the docs: https://rasa.com/docs/rasa/user-guide/testing-your-assistant/ + +stories: +- story: happy path 1 + steps: + - user: | + hello there! + intent: greet + - action: utter_greet + - user: | + amazing + intent: mood_great + - action: utter_happy + +- story: happy path 2 + steps: + - user: | + hello there! + intent: greet + - action: utter_greet + - user: | + amazing + intent: mood_great + - action: utter_happy + - user: | + bye-bye! + intent: goodbye + - action: utter_goodbye + +- story: sad path 1 + steps: + - user: | + hello + intent: greet + - action: utter_greet + - user: | + not good + intent: mood_unhappy + - action: utter_cheer_up + - action: utter_did_that_help + - user: | + yes + intent: affirm + - action: utter_happy + +- story: sad path 2 + steps: + - user: | + hello + intent: greet + - action: utter_greet + - user: | + not good + intent: mood_unhappy + - action: utter_cheer_up + - action: utter_did_that_help + - user: | + not really + intent: deny + - action: utter_goodbye + +- story: sad path 3 + steps: + - user: | + hi + intent: greet + - action: utter_greet + - user: | + very terrible + intent: mood_unhappy + - action: utter_cheer_up + - action: utter_did_that_help + - user: | + no + intent: deny + - action: utter_goodbye + +- story: say goodbye + steps: + - user: | + bye-bye! + intent: goodbye + - action: utter_goodbye + +- story: bot challenge + steps: + - user: | + are you a bot? 
+ intent: bot_challenge + - action: utter_iamabot diff --git a/rasa/cli/test.py b/rasa/cli/test.py index 0666da94adf2..b5341a8173e6 100644 --- a/rasa/cli/test.py +++ b/rasa/cli/test.py @@ -8,7 +8,6 @@ DEFAULT_CONFIG_PATH, DEFAULT_DATA_PATH, DEFAULT_E2E_TESTS_PATH, - DEFAULT_ENDPOINTS_PATH, DEFAULT_MODELS_PATH, DEFAULT_RESULTS_PATH, CONFIG_SCHEMA_FILE, @@ -63,11 +62,8 @@ def run_core_test(args: argparse.Namespace) -> None: from rasa import data from rasa.test import test_core_models_in_directory, test_core, test_core_models - endpoints = cli_utils.get_validated_path( - args.endpoints, "endpoints", DEFAULT_ENDPOINTS_PATH, True - ) stories = cli_utils.get_validated_path(args.stories, "stories", DEFAULT_DATA_PATH) - stories = data.get_core_directory(stories) + stories = data.get_test_directory(stories) output = args.out or DEFAULT_RESULTS_PATH args.errors = not args.no_errors @@ -87,7 +83,6 @@ def run_core_test(args: argparse.Namespace) -> None: test_core( model=model_path, stories=stories, - endpoints=endpoints, output=output, additional_arguments=vars(args), ) diff --git a/rasa/cli/utils.py b/rasa/cli/utils.py index 043daa32f337..8c9c4529b1f3 100644 --- a/rasa/cli/utils.py +++ b/rasa/cli/utils.py @@ -88,7 +88,7 @@ def cancel_cause_not_found( "The path '{}' does not exist. Please make sure to {}specify it" " with '--{}'.".format(current, default_clause, parameter) ) - exit(1) + sys.exit(1) def parse_last_positional_argument_as_model_path() -> None: @@ -174,7 +174,7 @@ def element_to_string(element: Dict[Text, Any], idx: int = 0) -> Text: def button_choices_from_message_data( message: Dict[Text, Any], allow_free_text_input: bool = True -) -> "Question": +) -> List[Text]: """Return list of choices to present to the user. If allow_free_text_input is True, an additional option is added diff --git a/rasa/constants.py b/rasa/constants.py index 154c80c2f32d..ba2365982e6d 100644 --- a/rasa/constants.py +++ b/rasa/constants.py @@ -15,6 +15,7 @@ DEFAULT_REQUEST_TIMEOUT = 60 * 5 # 5 minutes DEFAULT_RESPONSE_TIMEOUT = 60 * 60 # 1 hour +TEST_STORIES_FILE_PREFIX = "test_" TEST_DATA_FILE = "test.md" TRAIN_DATA_FILE = "train.md" NLG_DATA_FILE = "responses.md" @@ -38,6 +39,7 @@ DOCS_URL_DOMAINS = DOCS_BASE_URL + "/core/domains/" DOCS_URL_STORIES = DOCS_BASE_URL + "/core/stories/" DOCS_URL_RULES = DOCS_BASE_URL + "/core/rules/" +DOCS_URL_TEST_STORIES = DOCS_BASE_URL + "/testing-your-assistant" DOCS_URL_ACTIONS = DOCS_BASE_URL + "/core/actions/" DOCS_URL_CONNECTORS = DOCS_BASE_URL + "/user-guide/connectors/" DOCS_URL_EVENT_BROKERS = DOCS_BASE_URL + "/api/event-brokers/" @@ -51,7 +53,7 @@ DOCS_BASE_URL_RASA_X = "https://rasa.com/docs/rasa-x" -LEGACY_DOCS_BASE_URL = "http://legacy-docs.rasa.com" +LEGACY_DOCS_BASE_URL = "https://legacy-docs-v1.rasa.com" CONFIG_KEYS_CORE = ["policies"] CONFIG_KEYS_NLU = ["language", "pipeline"] diff --git a/rasa/core/agent.py b/rasa/core/agent.py index f1772db7e18b..c41a530ef743 100644 --- a/rasa/core/agent.py +++ b/rasa/core/agent.py @@ -1,52 +1,47 @@ +from asyncio import CancelledError import logging import os import shutil import tempfile -import uuid -from asyncio import CancelledError from typing import Any, Callable, Dict, List, Optional, Text, Tuple, Union +import uuid import aiohttp -from sanic import Sanic +from aiohttp import ClientError import rasa -import rasa.utils.io -import rasa.core.utils -from rasa.constants import ( - DEFAULT_DOMAIN_PATH, - LEGACY_DOCS_BASE_URL, - ENV_SANIC_BACKLOG, - DEFAULT_CORE_SUBDIRECTORY_NAME, -) -from rasa.core import constants, jobs, 
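The new `TEST_STORIES_FILE_PREFIX` constant above establishes the convention the rest of this diff builds on: YAML test stories are discovered by file name. A minimal sketch of that check (the real implementation is `YAMLStoryReader._has_test_prefix`, further down in this diff):

```python
from pathlib import Path

TEST_STORIES_FILE_PREFIX = "test_"  # mirrors the constant added in rasa/constants.py

def looks_like_test_stories_file(file_path: str) -> bool:
    """True if the file name marks it as a test stories file."""
    return Path(file_path).name.startswith(TEST_STORIES_FILE_PREFIX)

assert looks_like_test_stories_file("tests/test_stories.yml")
assert not looks_like_test_stories_file("data/stories.yml")
```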
training -from rasa.core.channels.channel import InputChannel, OutputChannel, UserMessage +from rasa.constants import DEFAULT_CORE_SUBDIRECTORY_NAME, DEFAULT_DOMAIN_PATH +from rasa.core import jobs, training +from rasa.core.channels.channel import OutputChannel, UserMessage from rasa.core.constants import DEFAULT_REQUEST_TIMEOUT from rasa.core.domain import Domain from rasa.core.exceptions import AgentNotReady from rasa.core.interpreter import NaturalLanguageInterpreter, RegexInterpreter -from rasa.core.lock_store import LockStore, InMemoryLockStore +from rasa.core.lock_store import InMemoryLockStore, LockStore from rasa.core.nlg import NaturalLanguageGenerator from rasa.core.policies.ensemble import PolicyEnsemble, SimplePolicyEnsemble from rasa.core.policies.memoization import MemoizationPolicy from rasa.core.policies.policy import Policy from rasa.core.processor import MessageProcessor from rasa.core.tracker_store import ( + FailSafeTrackerStore, InMemoryTrackerStore, TrackerStore, - FailSafeTrackerStore, ) from rasa.core.trackers import DialogueStateTracker +import rasa.core.utils from rasa.exceptions import ModelNotFound from rasa.importers.importer import TrainingDataImporter from rasa.model import ( - get_model_subdirectories, get_latest_model, - unpack_model, get_model, + get_model_subdirectories, + unpack_model, ) from rasa.nlu.utils import is_url -from rasa.utils.common import raise_warning, update_sanic_log_level +from rasa.utils.common import raise_warning from rasa.utils.endpoints import EndpointConfig +import rasa.utils.io logger = logging.getLogger(__name__) @@ -137,7 +132,8 @@ def _load_and_set_updated_model( ) logger.debug("Finished updating agent to new model.") - except Exception as e: + except Exception as e: # skipcq: PYL-W0703 + # TODO: this exception shouldn't be that broad, we need to be more specific logger.exception( f"Failed to update model. The previous model will stay loaded instead. " f"Error: {e}" @@ -238,7 +234,7 @@ async def _run_model_pulling_worker( await _update_model_from_server(model_server, agent) except CancelledError: logger.warning("Stopping model pulling (cancelled).") - except Exception: + except ClientError: logger.exception( "An exception was raised while fetching a model. Continuing anyways..." ) @@ -403,9 +399,9 @@ def load( try: if not model_path: raise ModelNotFound("No path specified.") - elif not os.path.exists(model_path): + if not os.path.exists(model_path): raise ModelNotFound(f"No file or directory at '{model_path}'.") - elif os.path.isfile(model_path): + if os.path.isfile(model_path): model_path = get_model(model_path) except ModelNotFound: raise ValueError( @@ -487,7 +483,7 @@ async def parse_message_using_nlu_interpreter( processor = self.create_processor() message = UserMessage(message_data) - return await processor._parse_message(message, tracker) + return await processor.parse_message(message, tracker) async def handle_message( self, @@ -504,9 +500,8 @@ async def handle_message( "not supported anymore. Rather use `agent.handle_text(...)`." ) - def noop(_): + def noop(_: Any) -> None: logger.info("Ignoring message as there is no agent to handle it.") - return None if not self.is_ready(): return noop(message) @@ -618,7 +613,7 @@ def toggle_memoization(self, activate: bool) -> None: for p in self.policy_ensemble.policies: # explicitly ignore inheritance (e.g. 
augmented memoization policy) - if type(p) == MemoizationPolicy: + if type(p) is MemoizationPolicy: p.toggle(activate) def _max_history(self) -> int: @@ -707,28 +702,6 @@ def train( if not self.is_core_ready(): raise AgentNotReady("Can't train without a policy ensemble.") - # deprecation tests - if kwargs.get("featurizer"): - raise Exception( - "Passing `featurizer` " - "to `agent.train(...)` is not supported anymore. " - "Pass appropriate featurizer directly " - "to the policy configuration instead. More info " - "{}/core/migrations.html".format(LEGACY_DOCS_BASE_URL) - ) - if ( - kwargs.get("epochs") - or kwargs.get("max_history") - or kwargs.get("batch_size") - ): - raise Exception( - "Passing policy configuration parameters " - "to `agent.train(...)` is not supported " - "anymore. Specify parameters directly in the " - "policy configuration instead. More info " - "{}/core/migrations.html".format(LEGACY_DOCS_BASE_URL) - ) - if isinstance(training_trackers, str): # the user most likely passed in a file name to load training # data from diff --git a/rasa/core/domain.py b/rasa/core/domain.py index 1d8dc25c0cbd..f355be4ef945 100644 --- a/rasa/core/domain.py +++ b/rasa/core/domain.py @@ -1126,9 +1126,9 @@ def is_domain_file(filename: Text) -> bool: Returns: `True` if it's a domain file, otherwise `False`. """ - from rasa.data import YAML_FILE_EXTENSIONS + from rasa.data import is_likely_yaml_file - if not Path(filename).suffix in YAML_FILE_EXTENSIONS: + if not is_likely_yaml_file(filename): return False try: content = rasa.utils.io.read_yaml_file(filename) diff --git a/rasa/core/events/__init__.py b/rasa/core/events/__init__.py index 0e13450c12e6..034903f2c0a0 100644 --- a/rasa/core/events/__init__.py +++ b/rasa/core/events/__init__.py @@ -56,18 +56,27 @@ def deserialise_entities(entities: Union[Text, List[Any]]) -> List[Dict[Text, An return [e for e in entities if isinstance(e, dict)] -def md_format_message(text, intent, entities) -> Text: - from rasa.nlu.training_data.formats import MarkdownReader +def md_format_message( + text: Text, intent: Optional[Text], entities: Union[Text, List[Any]] +) -> Text: + """Uses NLU parser information to generate a message with inline entity annotations. + + Arguments: + text: text of the message + intent: intent of the message + entities: entities of the message + + Return: + Message with entities annotated inline, e.g. + `I am from [Berlin]{"entity": "city"}`. 
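A simplified, self-contained sketch of the inline annotation format that the `md_format_message` docstring above documents. The heavy lifting is done by `TrainingDataWriter.generate_message` in the real code; entity dicts with `start`/`end`/`entity` keys are assumed here:

```python
import json
from typing import Any, Dict, List

def annotate_entities_inline(text: str, entities: List[Dict[str, Any]]) -> str:
    """Wrap each entity span as [value]{"entity": "type"}, working right-to-left
    so earlier character offsets stay valid while the text grows."""
    for entity in sorted(entities, key=lambda e: e["start"], reverse=True):
        start, end = entity["start"], entity["end"]
        label = json.dumps({"entity": entity["entity"]})
        text = text[:start] + "[" + text[start:end] + "]" + label + text[end:]
    return text

print(annotate_entities_inline(
    "I am from Berlin",
    [{"start": 10, "end": 16, "entity": "city"}],
))
# I am from [Berlin]{"entity": "city"}
```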
+ """ from rasa.nlu.training_data.formats.readerwriter import TrainingDataWriter + from rasa.nlu.training_data import entities_parser - message_from_md = MarkdownReader().parse_training_example(text) + message_from_md = entities_parser.parse_training_example(text, intent) deserialised_entities = deserialise_entities(entities) return TrainingDataWriter.generate_message( - { - "text": message_from_md.text, - "intent": intent, - "entities": deserialised_entities, - } + {"text": message_from_md.text, "entities": deserialised_entities,} ) @@ -227,16 +236,16 @@ def __init__( super().__init__(timestamp, metadata) + self.parse_data = { + "intent": self.intent, + "entities": self.entities, + "text": text, + "message_id": self.message_id, + "metadata": self.metadata, + } + if parse_data: - self.parse_data = parse_data - else: - self.parse_data = { - "intent": self.intent, - "entities": self.entities, - "text": text, - "message_id": self.message_id, - "metadata": self.metadata, - } + self.parse_data.update(**parse_data) @staticmethod def _from_parse_data( @@ -333,7 +342,9 @@ def as_story_string(self, e2e: bool = False) -> Text: intent=self.intent.get(INTENT_NAME_KEY, ""), entities=ent_string ) if e2e: - message = md_format_message(self.text, self.intent, self.entities) + message = md_format_message( + self.text, self.intent.get("name"), self.entities + ) return "{}: {}".format(self.intent.get(INTENT_NAME_KEY), message) else: return parse_string diff --git a/rasa/core/interpreter.py b/rasa/core/interpreter.py index b0ac7395091b..0236d8491aa4 100644 --- a/rasa/core/interpreter.py +++ b/rasa/core/interpreter.py @@ -1,3 +1,5 @@ +from json import JSONDecodeError + import aiohttp import json @@ -23,7 +25,7 @@ async def parse( self, text: Text, message_id: Optional[Text] = None, - tracker: DialogueStateTracker = None, + tracker: Optional[DialogueStateTracker] = None, ) -> Dict[Text, Any]: raise NotImplementedError( "Interpreter needs to be able to parse messages into structured output." @@ -86,11 +88,11 @@ def _parse_parameters( if isinstance(parsed_entities, dict): return RegexInterpreter._create_entities(parsed_entities, sidx, eidx) else: - raise Exception( + raise ValueError( f"Parsed value isn't a json object " f"(instead parser found '{type(parsed_entities)}')" ) - except Exception as e: + except (JSONDecodeError, ValueError) as e: raise_warning( f"Failed to parse arguments in line " f"'{user_input}'. Failed to decode parameters " @@ -108,7 +110,7 @@ def _parse_confidence(confidence_str: Text) -> float: try: return float(confidence_str.strip()[1:]) - except Exception as e: + except ValueError as e: raise_warning( f"Invalid to parse confidence value in line " f"'{confidence_str}'. 
Make sure the intent confidence is an " @@ -149,18 +151,13 @@ async def parse( self, text: Text, message_id: Optional[Text] = None, - tracker: DialogueStateTracker = None, + tracker: Optional[DialogueStateTracker] = None, ) -> Dict[Text, Any]: """Parse a text message.""" - return self.synchronous_parse(text, message_id, tracker) + return self.synchronous_parse(text) - def synchronous_parse( - self, - text: Text, - message_id: Optional[Text] = None, - tracker: DialogueStateTracker = None, - ) -> Dict[Text, Any]: + def synchronous_parse(self, text: Text,) -> Dict[Text, Any]: """Parse a text message.""" intent, confidence, entities = self.extract_intent_and_entities(text) @@ -189,7 +186,7 @@ async def parse( self, text: Text, message_id: Optional[Text] = None, - tracker: DialogueStateTracker = None, + tracker: Optional[DialogueStateTracker] = None, ) -> Dict[Text, Any]: """Parse a text message. @@ -241,7 +238,9 @@ async def _rasa_http_parse( f"http. Error: {response_text}" ) return None - except Exception: + except Exception: # skipcq: PYL-W0703 + # need to catch all possible exceptions when doing http requests + # (timeouts, value errors, parser errors, ...) logger.exception(f"Failed to parse text '{text}' using rasa NLU over http.") return None @@ -266,7 +265,7 @@ async def parse( self, text: Text, message_id: Optional[Text] = None, - tracker: DialogueStateTracker = None, + tracker: Optional[DialogueStateTracker] = None, ) -> Dict[Text, Any]: """Parse a text message. diff --git a/rasa/core/processor.py b/rasa/core/processor.py index 1926f2c88672..7d6e00c3c8e4 100644 --- a/rasa/core/processor.py +++ b/rasa/core/processor.py @@ -224,9 +224,6 @@ async def log_message( processing and saved at a later stage. """ - # preprocess message if necessary - if self.message_preprocessor is not None: - message.text = self.message_preprocessor(message.text) # we have a Tracker instance for each user # which maintains conversation state tracker = await self.get_tracker_with_session_start( @@ -441,18 +438,33 @@ def _check_for_unseen_features(self, parse_data: Dict[Text, Any]) -> None: def _get_action(self, action_name) -> Optional[Action]: return self.domain.action_for_name(action_name, self.action_endpoint) - async def _parse_message(self, message, tracker: DialogueStateTracker = None): + async def parse_message( + self, message: UserMessage, tracker: Optional[DialogueStateTracker] = None + ) -> Dict[Text, Any]: + """Interprete the passed message using the NLU interpreter. + + Arguments: + message: Message to handle + tracker: Dialogue context of the message + + Returns: + Parsed data extracted from the message. 
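Both the interpreter code above and `parse_message` below rely on the shorthand message format `/<intent>[@<confidence>][<entities as JSON>]`. A toy parser to illustrate the shape of that format; the real `RegexInterpreter` additionally expands the JSON object into entity dicts with start/end offsets:

```python
import json
import re
from typing import Any, Dict

SHORTHAND = re.compile(r"^/([^{@]+)(@[\d.]+)?(\{.*\})?$")

def parse_shorthand(text: str) -> Dict[str, Any]:
    """Parse e.g. '/greet@0.9{"name": "Rasa"}' into intent, confidence, entities."""
    match = SHORTHAND.match(text.strip())
    if not match:
        raise ValueError(f"Not a shorthand intent message: '{text}'")
    intent, confidence, entities = match.groups()
    return {
        "intent": {
            "name": intent,
            "confidence": float(confidence[1:]) if confidence else 1.0,
        },
        "entities": json.loads(entities) if entities else {},
    }

assert parse_shorthand("/greet")["intent"]["name"] == "greet"
assert parse_shorthand('/mood_great@0.6{"name": "Rasa"}')["intent"]["confidence"] == 0.6
```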
+ """ + # preprocess message if necessary + if self.message_preprocessor is not None: + text = self.message_preprocessor(message.text) + else: + text = message.text + # for testing - you can short-cut the NLU part with a message # in the format /intent{"entity1": val1, "entity2": val2} # parse_data is a dict of intent & entities - if message.text.startswith(INTENT_MESSAGE_PREFIX): + if text.startswith(INTENT_MESSAGE_PREFIX): parse_data = await RegexInterpreter().parse( - message.text, message.message_id, tracker + text, message.message_id, tracker ) else: - parse_data = await self.interpreter.parse( - message.text, message.message_id, tracker - ) + parse_data = await self.interpreter.parse(text, message.message_id, tracker) logger.debug( "Received user message '{}' with intent '{}' " @@ -472,7 +484,7 @@ async def _handle_message_with_tracker( if message.parse_data: parse_data = message.parse_data else: - parse_data = await self._parse_message(message, tracker) + parse_data = await self.parse_message(message, tracker) # don't ever directly mutate the tracker # - instead pass its events to log diff --git a/rasa/core/schemas/stories.yml b/rasa/core/schemas/stories.yml index 6fa74c67e973..345484ca1e4a 100644 --- a/rasa/core/schemas/stories.yml +++ b/rasa/core/schemas/stories.yml @@ -26,6 +26,9 @@ mapping: type: "str" required: True allowempty: False + user: + type: "str" + allowempty: False entities: type: "seq" matching: "any" diff --git a/rasa/core/test.py b/rasa/core/test.py index abfd665144fe..92cda8dc2c04 100644 --- a/rasa/core/test.py +++ b/rasa/core/test.py @@ -3,22 +3,25 @@ import warnings import typing from collections import defaultdict, namedtuple -from typing import Any, Dict, List, Optional, Text, Tuple, Union +from typing import Any, Dict, List, Optional, Text, Tuple +from rasa.core.channels import UserMessage +from rasa.core.training.story_writer.yaml_story_writer import YAMLStoryWriter import rasa.utils.io as io_utils from rasa.core.domain import Domain from rasa.nlu.constants import ( + ENTITIES, EXTRACTOR, ENTITY_ATTRIBUTE_VALUE, ENTITY_ATTRIBUTE_TEXT, ENTITY_ATTRIBUTE_START, ENTITY_ATTRIBUTE_END, ENTITY_ATTRIBUTE_TYPE, + INTENT, ) from rasa.constants import RESULTS_FILE, PERCENTAGE_KEY from rasa.core.utils import pad_lists_to_size from rasa.core.events import ActionExecuted, UserUttered -from rasa.nlu.training_data.formats.markdown import MarkdownWriter from rasa.core.trackers import DialogueStateTracker from rasa.nlu.training_data.formats.readerwriter import TrainingDataWriter from rasa.utils.io import DEFAULT_ENCODING @@ -30,8 +33,8 @@ CONFUSION_MATRIX_STORIES_FILE = "story_confusion_matrix.png" REPORT_STORIES_FILE = "story_report.json" -FAILED_STORIES_FILE = "failed_stories.md" -SUCCESSFUL_STORIES_FILE = "successful_stories.md" +FAILED_STORIES_FILE = "failed_test_stories.yml" +SUCCESSFUL_STORIES_FILE = "successful_test_stories.yml" logger = logging.getLogger(__name__) @@ -69,21 +72,21 @@ def __init__( def add_to_store( self, - action_predictions: Optional[Union[Text, List[Text]]] = None, - action_targets: Optional[Union[Text, List[Text]]] = None, - intent_predictions: Optional[Union[Text, List[Text]]] = None, - intent_targets: Optional[Union[Text, List[Text]]] = None, + action_predictions: Optional[List[Text]] = None, + action_targets: Optional[List[Text]] = None, + intent_predictions: Optional[List[Text]] = None, + intent_targets: Optional[List[Text]] = None, entity_predictions: Optional[List[Dict[Text, Any]]] = None, entity_targets: Optional[List[Dict[Text, Any]]] = 
None, ) -> None: """Add items or lists of items to the store""" - for k, v in locals().items(): - if k != "self" and v: - attr = getattr(self, k) - if isinstance(v, list): - attr.extend(v) - else: - attr.append(v) + + self.action_predictions.extend(action_predictions or []) + self.action_targets.extend(action_targets or []) + self.intent_targets.extend(intent_targets or []) + self.intent_predictions.extend(intent_predictions or []) + self.entity_predictions.extend(entity_predictions or []) + self.entity_targets.extend(entity_targets or []) def merge_store(self, other: "EvaluationStore") -> None: """Add the contents of other to self""" @@ -136,20 +139,30 @@ class WronglyPredictedAction(ActionExecuted): type_name = "wrong_action" def __init__( - self, correct_action, predicted_action, policy, confidence, timestamp=None + self, + action_name_target: Text, + action_name_prediction: Text, + policy: Optional[Text] = None, + confidence: Optional[float] = None, + timestamp: Optional[float] = None, + metadata: Optional[Dict] = None, ) -> None: - self.predicted_action = predicted_action - super().__init__(correct_action, policy, confidence, timestamp=timestamp) + self.action_name_prediction = action_name_prediction + super().__init__(action_name_target, policy, confidence, timestamp, metadata) + + def inline_comment(self) -> Text: + """A comment attached to this event. Used during dumping.""" + return f"predicted: {self.action_name_prediction}" def as_story_string(self) -> Text: - return f"{self.action_name} " + return f"{self.action_name} " class EndToEndUserUtterance(UserUttered): """End-to-end user utterance. Mostly used to print the full end-to-end user message in the - `failed_stories.md` output file.""" + `failed_test_stories.yml` output file.""" def as_story_string(self, e2e: bool = True) -> Text: return super().as_story_string(e2e=True) @@ -182,16 +195,24 @@ def __init__(self, event: UserUttered, eval_store: EvaluationStore) -> None: event.input_channel, ) - def as_story_string(self, e2e: bool = True) -> Text: + def inline_comment(self) -> Text: + """A comment attached to this event. 
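Note the call-site consequence of the `add_to_store` rewrite above: every argument is now a list (later hunks wrap scalars as `[predicted]`), and `or []` turns an omitted argument into a no-op extend. A condensed sketch of the pattern:

```python
from typing import List, Optional

class MiniEvaluationStore:
    def __init__(self) -> None:
        self.action_predictions: List[str] = []
        self.action_targets: List[str] = []

    def add_to_store(
        self,
        action_predictions: Optional[List[str]] = None,
        action_targets: Optional[List[str]] = None,
    ) -> None:
        # `or []` makes a missing argument a no-op extend.
        self.action_predictions.extend(action_predictions or [])
        self.action_targets.extend(action_targets or [])

store = MiniEvaluationStore()
store.add_to_store(action_predictions=["utter_goodbye"], action_targets=["utter_greet"])
```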
Used during dumping.""" from rasa.core.events import md_format_message - correct_message = md_format_message(self.text, self.intent, self.entities) predicted_message = md_format_message( self.text, self.predicted_intent, self.predicted_entities ) + return f"predicted: {self.predicted_intent}: {predicted_message}" + + def as_story_string(self, e2e: bool = True) -> Text: + from rasa.core.events import md_format_message + + correct_message = md_format_message( + self.text, self.intent.get("name"), self.entities + ) return ( - f"{self.intent.get('name')}: {correct_message} " + f"{self.intent.get('name')}: {correct_message} " + f"" ) @@ -206,7 +227,7 @@ async def _generate_trackers( from rasa.core import training story_graph = await training.extract_story_graph( - resource_name, agent.domain, agent.interpreter, use_e2e + resource_name, agent.domain, use_e2e ) g = TrainingDataGenerator( story_graph, @@ -245,23 +266,21 @@ def _clean_entity_results( def _collect_user_uttered_predictions( event: UserUttered, + predicted: Dict[Text, Any], partial_tracker: DialogueStateTracker, fail_on_prediction_errors: bool, ) -> EvaluationStore: user_uttered_eval_store = EvaluationStore() - intent_gold = event.parse_data.get("true_intent") - predicted_intent = event.parse_data.get("intent", {}).get("name") - - if not predicted_intent: - predicted_intent = [None] + intent_gold = event.intent.get("name") + predicted_intent = predicted.get(INTENT, {}).get("name") user_uttered_eval_store.add_to_store( - intent_predictions=predicted_intent, intent_targets=intent_gold + intent_predictions=[predicted_intent], intent_targets=[intent_gold] ) - entity_gold = event.parse_data.get("true_entities") - predicted_entities = event.parse_data.get("entities") + entity_gold = event.entities + predicted_entities = predicted.get(ENTITIES) if entity_gold or predicted_entities: user_uttered_eval_store.add_to_store( @@ -276,7 +295,9 @@ def _collect_user_uttered_predictions( if fail_on_prediction_errors: raise ValueError( "NLU model predicted a wrong intent. Failed Story:" - " \n\n{}".format(partial_tracker.export_stories()) + " \n\n{}".format( + YAMLStoryWriter().dumps(partial_tracker.as_story().story_steps) + ) ) else: end_to_end_user_utterance = EndToEndUserUtterance( @@ -334,7 +355,7 @@ def _collect_action_executed_predictions( predicted = action.name() action_executed_eval_store.add_to_store( - action_predictions=predicted, action_targets=gold + action_predictions=[predicted], action_targets=[gold] ) if action_executed_eval_store.has_prediction_target_mismatch(): @@ -346,7 +367,9 @@ def _collect_action_executed_predictions( if fail_on_prediction_errors: error_msg = ( "Model predicted a wrong action. 
Failed Story: " - "\n\n{}".format(partial_tracker.export_stories()) + "\n\n{}".format( + YAMLStoryWriter().dumps(partial_tracker.as_story().story_steps) + ) ) if FormPolicy.__name__ in policy: error_msg += ( @@ -372,7 +395,7 @@ def _form_might_have_been_rejected( ) -def _predict_tracker_actions( +async def _predict_tracker_actions( tracker: DialogueStateTracker, agent: "Agent", fail_on_prediction_errors: bool = False, @@ -426,8 +449,9 @@ def _predict_tracker_actions( num_predicted_actions += 1 elif use_e2e and isinstance(event, UserUttered): + predicted = await processor.parse_message(UserMessage(event.text)) user_uttered_result = _collect_user_uttered_predictions( - event, partial_tracker, fail_on_prediction_errors + event, predicted, partial_tracker, fail_on_prediction_errors ) tracker_eval_store.merge_store(user_uttered_result) @@ -451,10 +475,10 @@ def _in_training_data_fraction(action_list: List[Dict[Text, Any]]) -> float: if a["policy"] and not SimplePolicyEnsemble.is_not_memo_policy(a["policy"]) ] - return len(in_training_data) / len(action_list) + return len(in_training_data) / len(action_list) if action_list else 0 -def _collect_story_predictions( +async def _collect_story_predictions( completed_trackers: List["DialogueStateTracker"], agent: "Agent", fail_on_prediction_errors: bool = False, @@ -475,7 +499,11 @@ def _collect_story_predictions( action_list = [] for tracker in tqdm(completed_trackers): - tracker_results, predicted_tracker, tracker_actions = _predict_tracker_actions( + ( + tracker_results, + predicted_tracker, + tracker_actions, + ) = await _predict_tracker_actions( tracker, agent, fail_on_prediction_errors, use_e2e ) @@ -525,22 +553,16 @@ def _collect_story_predictions( ) -def _log_stories( - stories: List[DialogueStateTracker], filename: Text, out_directory: Text -) -> None: +def _log_stories(trackers: List[DialogueStateTracker], file_path: Text) -> None: """Write given stories to the given file.""" - if not out_directory: - return - with open( - os.path.join(out_directory, filename), "w", encoding=DEFAULT_ENCODING - ) as f: - if not stories: - f.write("") - - for story in stories: - f.write(story.export_stories(include_source=True)) - f.write("\n\n") + with open(file_path, "w", encoding=DEFAULT_ENCODING) as f: + if not trackers: + f.write("# None of the test stories failed - all good!") + else: + stories = [tracker.as_story(include_source=True) for tracker in trackers] + steps = [step for story in stories for step in story.story_steps] + f.write(YAMLStoryWriter().dumps(steps)) async def test( @@ -576,7 +598,7 @@ async def test( completed_trackers = await _generate_trackers(stories, agent, max_stories, e2e) - story_evaluation, _ = _collect_story_predictions( + story_evaluation, _ = await _collect_story_predictions( completed_trackers, agent, fail_on_prediction_errors, e2e ) @@ -613,20 +635,22 @@ async def test( include_report=False, ) - if not disable_plotting: + if not disable_plotting and out_directory: _plot_story_evaluation( evaluation_store.action_targets, evaluation_store.action_predictions, out_directory, ) - if errors: + if errors and out_directory: _log_stories( - story_evaluation.failed_stories, FAILED_STORIES_FILE, out_directory + story_evaluation.failed_stories, + os.path.join(out_directory, FAILED_STORIES_FILE), ) - if successes: + if successes and out_directory: _log_stories( - story_evaluation.successful_stories, SUCCESSFUL_STORIES_FILE, out_directory + story_evaluation.successful_stories, + os.path.join(out_directory, SUCCESSFUL_STORIES_FILE), ) 
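The `inline_comment` hooks introduced above surface in the dumped YAML as end-of-line comments via `ruamel` (see the `yaml_add_eol_comment` calls in the story writer changes further down). A minimal demonstration of that mechanism:

```python
import sys

from ruamel.yaml import YAML
from ruamel.yaml.comments import CommentedMap

step = CommentedMap()
step["action"] = "utter_greet"
# Attach the prediction as an end-of-line comment, as the writer does with
# the value returned by inline_comment().
step.yaml_add_eol_comment("predicted: utter_goodbye", "action")

YAML().dump([step], sys.stdout)
# - action: utter_greet  # predicted: utter_goodbye
```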
return { @@ -745,7 +769,7 @@ async def _evaluate_core_model(model: Text, stories_file: Text) -> int: agent = Agent.load(model) completed_trackers = await _generate_trackers(stories_file, agent) - story_eval_store, number_of_stories = _collect_story_predictions( + story_eval_store, number_of_stories = await _collect_story_predictions( completed_trackers, agent ) failed_stories = story_eval_store.failed_stories diff --git a/rasa/core/trackers.py b/rasa/core/trackers.py index 547087971ee1..dabb243cc25f 100644 --- a/rasa/core/trackers.py +++ b/rasa/core/trackers.py @@ -16,6 +16,8 @@ Union, ) +import typing + from rasa.nlu.constants import ( ENTITY_ATTRIBUTE_VALUE, ENTITY_ATTRIBUTE_TYPE, @@ -42,6 +44,10 @@ from rasa.core.slots import Slot from rasa.utils import common as common_utils + +if typing.TYPE_CHECKING: + from rasa.core.training.structures import Story + logger = logging.getLogger(__name__) @@ -528,7 +534,7 @@ def update(self, event: Event, domain: Optional[Domain] = None) -> None: for e in domain.slots_for_entities(event.parse_data["entities"]): self.update(e) - def export_stories(self, e2e: bool = False, include_source: bool = False) -> Text: + def as_story(self, include_source: bool = False) -> "Story": """Dump the tracker as a story in the Rasa Core story format. Returns the dumped tracker as a string.""" @@ -539,7 +545,18 @@ def export_stories(self, e2e: bool = False, include_source: bool = False) -> Tex if include_source else self.sender_id ) - story = Story.from_events(self.applied_events(), story_name) + return Story.from_events(self.applied_events(), story_name) + + def export_stories(self, e2e: bool = False, include_source: bool = False) -> Text: + """Dump the tracker as a story in the Rasa Core story format. + + Returns: + The dumped tracker as a string. + """ + # TODO: we need to revisit all usages of this, the caller needs to specify + # the format. 
this likely points to areas where we are not properly + # handling markdown vs yaml + story = self.as_story(include_source) return story.as_story_string(flat=True, e2e=e2e) def export_stories_to_file(self, export_path: Text = "debug.md") -> None: diff --git a/rasa/core/training/__init__.py b/rasa/core/training/__init__.py index 3b77dc45606b..3d43493c33bf 100644 --- a/rasa/core/training/__init__.py +++ b/rasa/core/training/__init__.py @@ -2,7 +2,6 @@ if TYPE_CHECKING: from rasa.core.domain import Domain - from rasa.core.interpreter import NaturalLanguageInterpreter from rasa.core.trackers import DialogueStateTracker from rasa.core.training.structures import StoryGraph from rasa.importers.importer import TrainingDataImporter @@ -11,20 +10,15 @@ async def extract_rule_data( resource_name: Text, domain: "Domain", - interpreter: Optional["NaturalLanguageInterpreter"] = None, use_e2e: bool = False, exclusion_percentage: int = None, ) -> "StoryGraph": - from rasa.core.interpreter import RegexInterpreter from rasa.core.training import loading from rasa.core.training.structures import StoryGraph - if not interpreter: - interpreter = RegexInterpreter() story_steps = await loading.load_data_from_resource( resource_name, domain, - interpreter, use_e2e=use_e2e, exclusion_percentage=exclusion_percentage, ) @@ -34,20 +28,15 @@ async def extract_rule_data( async def extract_story_graph( resource_name: Text, domain: "Domain", - interpreter: Optional["NaturalLanguageInterpreter"] = None, use_e2e: bool = False, exclusion_percentage: Optional[int] = None, ) -> "StoryGraph": - from rasa.core.interpreter import RegexInterpreter from rasa.core.training.structures import StoryGraph import rasa.core.training.loading as core_loading - if not interpreter: - interpreter = RegexInterpreter() story_steps = await core_loading.load_data_from_resource( resource_name, domain, - interpreter, use_e2e=use_e2e, exclusion_percentage=exclusion_percentage, ) @@ -62,9 +51,32 @@ async def load_data( augmentation_factor: int = 50, tracker_limit: Optional[int] = None, use_story_concatenation: bool = True, - debug_plots=False, + debug_plots: bool = False, exclusion_percentage: Optional[int] = None, ) -> List["DialogueStateTracker"]: + """ + Load training data from a resource. + + Args: + resource_name: resource to load the data from. either a path or an importer + domain: domain used for loading + remove_duplicates: should duplicated training examples be removed? 
+ unique_last_num_states: number of states in a conversation that make the + a tracker unique (this is used to identify duplicates) + augmentation_factor: + by how much should the story training data be augmented + tracker_limit: + maximum number of trackers to generate during augmentation + use_story_concatenation: + should stories be concatenated when doing data augmentation + debug_plots: + generate debug plots during loading + exclusion_percentage: + how much data to exclude + + Returns: + list of loaded trackers + """ from rasa.core.training.generator import TrainingDataGenerator from rasa.importers.importer import TrainingDataImporter diff --git a/rasa/core/training/dsl.py b/rasa/core/training/dsl.py deleted file mode 100644 index 4a52a414cb1a..000000000000 --- a/rasa/core/training/dsl.py +++ /dev/null @@ -1,58 +0,0 @@ -import logging -import re -from typing import Optional, Text, TYPE_CHECKING - -from rasa.constants import DOCS_BASE_URL -from rasa.core.constants import INTENT_MESSAGE_PREFIX -from rasa.core.interpreter import RegexInterpreter -from rasa.core.training.structures import FORM_PREFIX -from rasa.nlu.training_data.formats import MarkdownReader - -if TYPE_CHECKING: - from rasa.nlu.training_data import Message - -logger = logging.getLogger(__name__) - - -class EndToEndReader(MarkdownReader): - def __init__(self) -> None: - super().__init__() - self._regex_interpreter = RegexInterpreter() - - def _parse_item(self, line: Text) -> Optional["Message"]: - f"""Parses an md list item line based on the current section type. - - Matches expressions of the form `:. For the - syntax of see the Rasa docs on NLU training data: - {DOCS_BASE_URL}/nlu/training-data-format/#markdown-format""" - - # Match three groups: - # 1) Potential "form" annotation - # 2) The correct intent - # 3) Optional entities - # 4) The message text - form_group = fr"({FORM_PREFIX}\s*)*" - item_regex = re.compile(r"\s*" + form_group + r"([^{}]+?)({.*})*:\s*(.*)") - match = re.match(item_regex, line) - - if not match: - raise ValueError( - "Encountered invalid end-to-end format for message " - "`{}`. Please visit the documentation page on " - "end-to-end testing at {}/user-guide/testing-your-assistant/" - "#end-to-end-testing/".format(line, DOCS_BASE_URL) - ) - - intent = match.group(2) - self.current_title = intent - message = match.group(4) - example = self.parse_training_example(message) - - # If the message starts with the `INTENT_MESSAGE_PREFIX` potential entities - # are annotated in the json format (e.g. 
`/greet{"name": "Rasa"}) - if message.startswith(INTENT_MESSAGE_PREFIX): - parsed = self._regex_interpreter.synchronous_parse(message) - example.data["entities"] = parsed["entities"] - - example.data["true_intent"] = intent - return example diff --git a/rasa/core/training/interactive.py b/rasa/core/training/interactive.py index 133bdbf82f39..3ac7560369ee 100644 --- a/rasa/core/training/interactive.py +++ b/rasa/core/training/interactive.py @@ -6,7 +6,6 @@ import uuid from functools import partial from multiprocessing import Process -from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Text, Tuple, Union, Set import numpy as np @@ -23,14 +22,12 @@ import rasa.cli.utils from questionary import Choice, Form, Question -from rasa.cli import utils as cli_utils from rasa.core import constants, run, train, utils from rasa.core.actions.action import ACTION_LISTEN_NAME, default_action_names from rasa.core.channels.channel import UserMessage from rasa.core.constants import ( DEFAULT_SERVER_FORMAT, DEFAULT_SERVER_PORT, - DEFAULT_SERVER_URL, REQUESTED_SLOT, UTTER_PREFIX, ) @@ -73,9 +70,9 @@ MAX_VISUAL_HISTORY = 3 PATHS = { - "stories": "data/stories.md", - "nlu": "data/nlu.md", - "backup": "data/nlu_interactive.md", + "stories": "data/stories.yml", + "nlu": "data/nlu.yml", + "backup": "data/nlu_interactive.yml", "domain": "domain.yml", } @@ -259,7 +256,7 @@ def format_bot_output(message: BotUttered) -> Text: if data.get("buttons"): output += "\nButtons:" - choices = cli_utils.button_choices_from_message_data( + choices = rasa.cli.utils.button_choices_from_message_data( data, allow_free_text_input=True ) for choice in choices: @@ -268,13 +265,13 @@ def format_bot_output(message: BotUttered) -> Text: if data.get("elements"): output += "\nElements:" for idx, element in enumerate(data.get("elements")): - element_str = cli_utils.element_to_string(element, idx) + element_str = rasa.cli.utils.element_to_string(element, idx) output += "\n" + element_str if data.get("quick_replies"): output += "\nQuick replies:" for idx, element in enumerate(data.get("quick_replies")): - element_str = cli_utils.element_to_string(element, idx) + element_str = rasa.cli.utils.element_to_string(element, idx) output += "\n" + element_str return output @@ -570,7 +567,7 @@ def _slot_history(tracker_dump: Dict[Text, Any]) -> List[Text]: slot_strings = [] for k, s in tracker_dump.get("slots", {}).items(): - colored_value = cli_utils.wrap_with_color( + colored_value = rasa.cli.utils.wrap_with_color( str(s), color=rasa.cli.utils.bcolors.WARNING ) slot_strings.append(f"{k}: {colored_value}") @@ -843,20 +840,16 @@ def _write_nlu_to_file(export_nlu_path: Text, events: List[Dict[Text, Any]]) -> def _get_nlu_target_format(export_path: Text) -> Text: - from rasa.data import ( - YAML_FILE_EXTENSIONS, - MARKDOWN_FILE_EXTENSIONS, - JSON_FILE_EXTENSIONS, - ) + from rasa import data guessed_format = loading.guess_format(export_path) if guessed_format not in {MARKDOWN, RASA, RASA_YAML}: - if Path(export_path).suffix in JSON_FILE_EXTENSIONS: + if data.is_likely_json_file(export_path): guessed_format = RASA - elif Path(export_path).suffix in MARKDOWN_FILE_EXTENSIONS: + elif data.is_likely_markdown_file(export_path): guessed_format = MARKDOWN - elif Path(export_path).suffix in YAML_FILE_EXTENSIONS: + elif data.is_likely_yaml_file(export_path): guessed_format = RASA_YAML return guessed_format @@ -954,21 +947,20 @@ async def _predict_till_next_listen( if last_event.get("event") == BotUttered.type_name and 
last_event["data"].get( "buttons", None ): - response = _get_button_choice(last_event) - if response != cli_utils.FREE_TEXT_INPUT_PROMPT: - await send_message(endpoint, conversation_id, response) + user_selection = _get_button_choice(last_event) + if user_selection != rasa.cli.utils.FREE_TEXT_INPUT_PROMPT: + await send_message(endpoint, conversation_id, user_selection) def _get_button_choice(last_event: Dict[Text, Any]) -> Text: data = last_event["data"] message = last_event.get("text", "") - choices = cli_utils.button_choices_from_message_data( + choices = rasa.cli.utils.button_choices_from_message_data( data, allow_free_text_input=True ) question = questionary.select(message, choices) - response = cli_utils.payload_from_button_question(question) - return response + return rasa.cli.utils.payload_from_button_question(question) async def _correct_wrong_nlu( @@ -1224,7 +1216,7 @@ async def _correct_entities( """Validate the entities of a user message. Returns the corrected entities""" - from rasa.nlu.training_data.formats import MarkdownReader + from rasa.nlu.training_data import entities_parser parse_original = latest_message.get("parse_data", {}) entity_str = _as_md_message(parse_original) @@ -1233,8 +1225,7 @@ async def _correct_entities( ) annotation = await _ask_questions(question, conversation_id, endpoint) - # noinspection PyProtectedMember - parse_annotated = MarkdownReader().parse_training_example(annotation) + parse_annotated = entities_parser.parse_training_example(annotation) corrected_entities = _merge_annotated_and_original_entities( parse_annotated, parse_original diff --git a/rasa/core/training/loading.py b/rasa/core/training/loading.py index fd80553bbbc2..5a10095c97e6 100644 --- a/rasa/core/training/loading.py +++ b/rasa/core/training/loading.py @@ -3,9 +3,9 @@ from pathlib import Path from typing import Text, Optional, Dict, List, Union +from rasa import data import rasa.utils.io as io_utils from rasa.core.domain import Domain -from rasa.core.interpreter import NaturalLanguageInterpreter, RegexInterpreter from rasa.core.training.story_reader.markdown_story_reader import MarkdownStoryReader from rasa.core.training.story_reader.story_reader import StoryReader from rasa.core.training.story_reader.yaml_story_reader import YAMLStoryReader @@ -18,40 +18,30 @@ def _get_reader( filename: Text, domain: Domain, - interpreter: NaturalLanguageInterpreter = RegexInterpreter(), template_variables: Optional[Dict] = None, use_e2e: bool = False, ) -> StoryReader: - if Path(filename).suffix in MARKDOWN_FILE_EXTENSIONS: - return MarkdownStoryReader( - interpreter, domain, template_variables, use_e2e, filename - ) - elif Path(filename).suffix in YAML_FILE_EXTENSIONS: - return YAMLStoryReader( - interpreter, domain, template_variables, use_e2e, filename - ) + if data.is_likely_markdown_file(filename): + return MarkdownStoryReader(domain, template_variables, use_e2e, filename) + elif data.is_likely_yaml_file(filename): + return YAMLStoryReader(domain, template_variables, use_e2e, filename) else: # This is a use case for uploading the story over REST API. # The source file has a random name. 
- return _guess_reader(filename, domain, interpreter, template_variables, use_e2e) + return _guess_reader(filename, domain, template_variables, use_e2e) def _guess_reader( filename: Text, domain: Domain, - interpreter: NaturalLanguageInterpreter = RegexInterpreter(), template_variables: Optional[Dict] = None, use_e2e: bool = False, ) -> StoryReader: if YAMLStoryReader.is_yaml_story_file(filename): - return YAMLStoryReader( - interpreter, domain, template_variables, use_e2e, filename - ) + return YAMLStoryReader(domain, template_variables, use_e2e, filename) elif MarkdownStoryReader.is_markdown_story_file(filename): - return MarkdownStoryReader( - interpreter, domain, template_variables, use_e2e, filename - ) + return MarkdownStoryReader(domain, template_variables, use_e2e, filename) raise ValueError( f"Failed to find a reader class for the story file `{filename}`. " f"Supported formats are " @@ -62,7 +52,6 @@ def _guess_reader( async def load_data_from_resource( resource: Union[Text, Path], domain: Domain, - interpreter: NaturalLanguageInterpreter = RegexInterpreter(), template_variables: Optional[Dict] = None, use_e2e: bool = False, exclusion_percentage: Optional[int] = None, @@ -72,7 +61,6 @@ async def load_data_from_resource( Args: resource: Folder/File with core training data files. domain: Domain object. - interpreter: Interpreter to be used for parsing user's utterances. template_variables: Variables that have to be replaced in the training data. use_e2e: Identifies if the e2e reader should be used. exclusion_percentage: Identifies the percentage of training data that @@ -87,7 +75,6 @@ async def load_data_from_resource( return await load_data_from_files( io_utils.list_files(resource), domain, - interpreter, template_variables, use_e2e, exclusion_percentage, @@ -97,7 +84,6 @@ async def load_data_from_resource( async def load_data_from_files( story_files: List[Text], domain: Domain, - interpreter: NaturalLanguageInterpreter = RegexInterpreter(), template_variables: Optional[Dict] = None, use_e2e: bool = False, exclusion_percentage: Optional[int] = None, @@ -107,7 +93,6 @@ async def load_data_from_files( Args: story_files: List of files with training data in it. domain: Domain object. - interpreter: Interpreter to be used for parsing user's utterances. template_variables: Variables that have to be replaced in the training data. use_e2e: Identifies whether the e2e reader should be used. 
exclusion_percentage: Identifies the percentage of training data that @@ -120,9 +105,7 @@ async def load_data_from_files( for story_file in story_files: - reader = _get_reader( - story_file, domain, interpreter, template_variables, use_e2e - ) + reader = _get_reader(story_file, domain, template_variables, use_e2e) steps = await reader.read_from_file(story_file) story_steps.extend(steps) diff --git a/rasa/core/training/story_reader/markdown_story_reader.py b/rasa/core/training/story_reader/markdown_story_reader.py index c51191ede65b..813517b36daf 100644 --- a/rasa/core/training/story_reader/markdown_story_reader.py +++ b/rasa/core/training/story_reader/markdown_story_reader.py @@ -1,21 +1,25 @@ -import asyncio import json import logging import os import re -from pathlib import PurePath, Path +from pathlib import Path from typing import Dict, Text, List, Any, Union +import rasa.data +from rasa.nlu.training_data import Message import rasa.utils.io as io_utils -from rasa.constants import DOCS_URL_DOMAINS, DOCS_URL_STORIES +from rasa.constants import ( + DEFAULT_E2E_TESTS_PATH, + DOCS_URL_DOMAINS, + DOCS_URL_STORIES, + LEGACY_DOCS_BASE_URL, +) from rasa.core.constants import INTENT_MESSAGE_PREFIX from rasa.core.events import UserUttered from rasa.core.exceptions import StoryParseError from rasa.core.interpreter import RegexInterpreter -from rasa.core.training.dsl import EndToEndReader from rasa.core.training.story_reader.story_reader import StoryReader from rasa.core.training.structures import StoryStep, FORM_PREFIX -from rasa.data import MARKDOWN_FILE_EXTENSIONS from rasa.nlu.constants import INTENT_NAME_KEY from rasa.utils.common import raise_warning @@ -101,7 +105,7 @@ async def _process_lines(self, lines: List[Text]) -> List[StoryStep]: except Exception as e: msg = f"Error in line {line_num}: {e}" logger.error(msg, exc_info=1) # pytype: disable=wrong-arg-types - raise ValueError(msg) + raise ValueError(msg) from e self._add_current_stories_to_result() return self.story_steps @@ -173,15 +177,13 @@ def _parse_event_line(line): ) return "", {} - async def _add_user_messages(self, messages, line_num): + async def _add_user_messages(self, messages: List[Text], line_num: int) -> None: if not self.current_step_builder: raise StoryParseError( "User message '{}' at invalid location. " "Expected story start.".format(messages) ) - parsed_messages = await asyncio.gather( - *[self._parse_message(m, line_num) for m in messages] - ) + parsed_messages = [self._parse_message(m, line_num) for m in messages] self.current_step_builder.add_user_messages( parsed_messages, self.unfold_or_utterances ) @@ -193,25 +195,57 @@ async def _add_e2e_messages(self, e2e_messages: List[Text], line_num: int) -> No "location. Expected story start." "".format(e2e_messages) ) - e2e_reader = EndToEndReader() + parsed_messages = [] for m in e2e_messages: - message = e2e_reader._parse_item(m) - parsed = await self._parse_message(message.text, line_num) - - parsed.parse_data["true_intent"] = message.data["true_intent"] - parsed.parse_data["true_entities"] = message.data.get("entities") or [] + message = self.parse_e2e_message(m) + parsed = self._parse_message(message.text, line_num) parsed_messages.append(parsed) self.current_step_builder.add_user_messages(parsed_messages) - async def _parse_message(self, message: Text, line_num: int) -> UserUttered: + @staticmethod + def parse_e2e_message(line: Text) -> Message: + """Parses an md list item line based on the current section type. + + Matches expressions of the form `:`. 
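With the interpreter parameter gone, the `_get_reader`/`_guess_reader` pair above dispatches purely on the file itself: extension first, content sniffing as a fallback for files with random names (e.g. REST uploads). Schematically:

```python
from pathlib import Path

YAML_SUFFIXES = {".yml", ".yaml"}
MARKDOWN_SUFFIXES = {".md"}

def likely_story_format(filename: str) -> str:
    """Extension-first dispatch as in _get_reader; the real code falls back
    to sniffing the file content when the suffix is inconclusive."""
    suffix = Path(filename).suffix
    if suffix in MARKDOWN_SUFFIXES:
        return "markdown"
    if suffix in YAML_SUFFIXES:
        return "yaml"
    return "guess-from-content"

assert likely_story_format("data/stories.yml") == "yaml"
assert likely_story_format("upload_8f3a") == "guess-from-content"
```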
For the + syntax of `` see the Rasa docs on NLU training data.""" + + # Match three groups: + # 1) Potential "form" annotation + # 2) The correct intent + # 3) Optional entities + # 4) The message text + form_group = fr"({FORM_PREFIX}\s*)*" + item_regex = re.compile(r"\s*" + form_group + r"([^{}]+?)({.*})*:\s*(.*)") + match = re.match(item_regex, line) + + if not match: + raise ValueError( + "Encountered invalid test story format for message " + "`{}`. Please visit the documentation page on " + "end-to-end testing at {}/user-guide/testing-your-assistant/" + "#end-to-end-testing/".format(line, LEGACY_DOCS_BASE_URL) + ) + from rasa.nlu.training_data import entities_parser + + intent = match.group(2) + message = match.group(4) + example = entities_parser.parse_training_example(message, intent) + + # If the message starts with the `INTENT_MESSAGE_PREFIX` potential entities + # are annotated in the json format (e.g. `/greet{"name": "Rasa"}) if message.startswith(INTENT_MESSAGE_PREFIX): - parse_data = await RegexInterpreter().parse(message) - else: - parse_data = await self.interpreter.parse(message) + parsed = RegexInterpreter().synchronous_parse(message) + example.data["entities"] = parsed["entities"] + + return example + + def _parse_message(self, message: Text, line_num: int) -> UserUttered: + parse_data = RegexInterpreter().synchronous_parse(message) utterance = UserUttered( message, parse_data.get("intent"), parse_data.get("entities"), parse_data ) + intent_name = utterance.intent.get(INTENT_NAME_KEY) if self.domain and intent_name not in self.domain.intents: raise_warning( @@ -234,9 +268,9 @@ def is_markdown_story_file(file_path: Union[Text, Path]) -> bool: `True` in case the file is a Core Markdown training data or rule data file, `False` otherwise. """ - suffix = PurePath(file_path).suffix - - if suffix not in MARKDOWN_FILE_EXTENSIONS: + if not rasa.data.is_likely_markdown_file(file_path) or rasa.data.is_nlu_file( + file_path + ): return False try: @@ -257,6 +291,26 @@ def is_markdown_story_file(file_path: Union[Text, Path]) -> bool: ) return False + @staticmethod + def is_markdown_test_stories_file(file_path: Union[Text, Path]) -> bool: + """Checks if a file contains test stories. + + Args: + file_path: Path of the file which should be checked. + + Returns: + `True` if it's a file containing test stories, otherwise `False`. 
+ """ + if not rasa.data.is_likely_markdown_file(file_path): + return False + + dirname = os.path.dirname(file_path) + return ( + DEFAULT_E2E_TESTS_PATH in dirname + and rasa.data.is_story_file(file_path) + and not rasa.data.is_nlu_file(file_path) + ) + @staticmethod def _contains_story_or_rule_pattern(text: Text) -> bool: story_pattern = r".*##.+" diff --git a/rasa/core/training/story_reader/story_reader.py b/rasa/core/training/story_reader/story_reader.py index ffa028fc6b53..f0512110df33 100644 --- a/rasa/core/training/story_reader/story_reader.py +++ b/rasa/core/training/story_reader/story_reader.py @@ -4,7 +4,6 @@ from rasa.core.domain import Domain from rasa.core.events import SlotSet, ActionExecuted, Event from rasa.core.exceptions import StoryParseError -from rasa.core.interpreter import NaturalLanguageInterpreter from rasa.core.training.story_reader.story_step_builder import StoryStepBuilder from rasa.core.training.structures import StoryStep @@ -16,7 +15,6 @@ class StoryReader: def __init__( self, - interpreter: NaturalLanguageInterpreter, domain: Optional[Domain] = None, template_vars: Optional[Dict] = None, use_e2e: bool = False, @@ -26,7 +24,6 @@ def __init__( """Constructor for the StoryReader. Args: - interpreter: Interpreter to be used to parse intents. domain: Domain object. template_vars: Template variables to be replaced. use_e2e: Specifies whether to use the e2e parser or not. @@ -41,7 +38,6 @@ def __init__( self.story_steps = [] self.current_step_builder: Optional[StoryStepBuilder] = None self.domain = domain - self.interpreter = interpreter self.template_variables = template_vars if template_vars else {} self.use_e2e = use_e2e self.source_name = source_name diff --git a/rasa/core/training/story_reader/yaml_story_reader.py b/rasa/core/training/story_reader/yaml_story_reader.py index 218c32f8af5d..0177c2c4bb7c 100644 --- a/rasa/core/training/story_reader/yaml_story_reader.py +++ b/rasa/core/training/story_reader/yaml_story_reader.py @@ -2,19 +2,24 @@ from pathlib import Path from typing import Dict, Text, List, Any, Optional, Union +from rasa.nlu.training_data import entities_parser from rasa.utils.validation import validate_yaml_schema, InvalidYamlFileError from ruamel.yaml.parser import ParserError import rasa.utils.common as common_utils import rasa.utils.io as io_utils -from rasa.constants import DOCS_URL_STORIES, DOCS_URL_RULES +from rasa.constants import ( + TEST_STORIES_FILE_PREFIX, + DOCS_URL_STORIES, + DOCS_URL_RULES, +) from rasa.core.constants import INTENT_MESSAGE_PREFIX from rasa.core.actions.action import RULE_SNIPPET_ACTION_NAME from rasa.core.events import UserUttered, SlotSet, ActiveLoop from rasa.core.training.story_reader.story_reader import StoryReader from rasa.core.training.structures import StoryStep -from rasa.data import YAML_FILE_EXTENSIONS from rasa.nlu.constants import INTENT_NAME_KEY +import rasa.data logger = logging.getLogger(__name__) @@ -25,6 +30,7 @@ KEY_STEPS = "steps" KEY_ENTITIES = "entities" KEY_USER_INTENT = "intent" +KEY_USER_MESSAGE = "user" KEY_SLOT_NAME = "slot_was_set" KEY_SLOT_VALUE = "value" KEY_ACTIVE_LOOP = "active_loop" @@ -55,7 +61,6 @@ def from_reader(cls, reader: "YAMLStoryReader") -> "YAMLStoryReader": A new reader instance. 
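Unlike YAML test stories, Markdown test files carry no name prefix, so `is_markdown_test_stories_file` above decides by location instead. A rough equivalent, assuming `DEFAULT_E2E_TESTS_PATH` points at a project's `tests` directory:

```python
import os

DEFAULT_E2E_TESTS_PATH = "tests"  # assumed value of the constant used above

def is_markdown_test_stories_file(file_path: str) -> bool:
    """A Markdown story file counts as a test file when it lives under tests/."""
    if not file_path.endswith(".md"):
        return False
    # The real check additionally requires is_story_file and not is_nlu_file.
    return DEFAULT_E2E_TESTS_PATH in os.path.dirname(file_path)

assert is_markdown_test_stories_file("tests/conversation_tests.md")
assert not is_markdown_test_stories_file("data/stories.md")
```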
""" return cls( - reader.interpreter, reader.domain, reader.template_variables, reader.use_e2e, @@ -117,8 +122,8 @@ def read_from_parsed_yaml( return self.story_steps - @staticmethod - def is_yaml_story_file(file_path: Text) -> bool: + @classmethod + def is_yaml_story_file(cls, file_path: Text) -> bool: """Check if file contains Core training data or rule data in YAML format. Args: @@ -128,24 +133,56 @@ def is_yaml_story_file(file_path: Text) -> bool: `True` in case the file is a Core YAML training data or rule data file, `False` otherwise. """ - suffix = Path(file_path).suffix + return rasa.data.is_likely_yaml_file(file_path) and cls.is_key_in_yaml( + file_path, KEY_STORIES, KEY_RULES + ) - if suffix and suffix not in YAML_FILE_EXTENSIONS: - return False + @classmethod + def is_key_in_yaml(cls, file_path: Text, *keys: Text) -> bool: + """Check if all keys are contained in the parsed dictionary from a yaml file. + Arguments: + file_path: path to the yaml file + keys: keys to look for + Returns: + `True` if all the keys are contained in the file, `False` otherwise. + """ try: content = io_utils.read_yaml_file(file_path) - return any(key in content for key in [KEY_STORIES, KEY_RULES]) + return any(key in content for key in keys) except Exception as e: # Using broad `Exception` because yaml library is not exposing all Errors common_utils.raise_warning( - f"Tried to check if '{file_path}' is a story or rule file, but failed " - f"to read it. If this file contains story or rule data, you should " - f"investigate this error, otherwise it is probably best to " - f"move the file to a different location. Error: {e}" + f"Tried to open '{file_path}' and load its data, but failed " + f"to read it. There seems to be an error with the yaml syntax: {e}" ) return False + @classmethod + def _has_test_prefix(cls, file_path: Text) -> bool: + """Check if the filename of a file at a path has a certain prefix. + + Arguments: + file_path: path to the file + + Returns: + `True` if the filename starts with the prefix, `False` otherwise. + """ + return Path(file_path).name.startswith(TEST_STORIES_FILE_PREFIX) + + @classmethod + def is_yaml_test_stories_file(cls, file_path: Union[Text, Path]) -> bool: + """Checks if a file is a test conversations file. + + Args: + file_path: Path of the file which should be checked. + + Returns: + `True` if it's a conversation test file, otherwise `False`. + """ + + return cls._has_test_prefix(file_path) and cls.is_yaml_story_file(file_path) + def get_steps(self) -> List[StoryStep]: self._add_current_stories_to_result() return self.story_steps @@ -215,7 +252,7 @@ def _parse_step(self, step: Union[Text, Dict[Text, Any]]) -> None: f"'{RULE_SNIPPET_ACTION_NAME}'. It will be skipped.", docs=self._get_docs_link(), ) - elif KEY_USER_INTENT in step.keys(): + elif KEY_USER_INTENT in step.keys() or KEY_USER_MESSAGE in step.keys(): self._parse_user_utterance(step) elif KEY_OR in step.keys(): self._parse_or_statement(step) @@ -291,10 +328,10 @@ def _parse_or_statement(self, step: Dict[Text, Any]) -> None: self.current_step_builder.add_user_messages(utterances) - def _parse_raw_user_utterance(self, step: Dict[Text, Any]) -> Optional[UserUttered]: - user_utterance = step.get(KEY_USER_INTENT, "").strip() + def _user_intent_from_step(self, step: Dict[Text, Any]) -> Text: + user_intent = step.get(KEY_USER_INTENT, "").strip() - if not user_utterance: + if not user_intent: common_utils.raise_warning( f"Issue found in '{self.source_name}':\n" f"User utterance cannot be empty. 
" @@ -303,22 +340,31 @@ def _parse_raw_user_utterance(self, step: Dict[Text, Any]) -> Optional[UserUtter docs=self._get_docs_link(), ) - raw_entities = step.get(KEY_ENTITIES, []) - final_entities = self._parse_raw_entities(raw_entities) - - if user_utterance.startswith(INTENT_MESSAGE_PREFIX): + if user_intent.startswith(INTENT_MESSAGE_PREFIX): common_utils.raise_warning( f"Issue found in '{self.source_name}':\n" - f"User intent '{user_utterance}' starts with " + f"User intent '{user_intent}' starts with " f"'{INTENT_MESSAGE_PREFIX}'. This is not required.", docs=self._get_docs_link(), ) # Remove leading slash - user_utterance = user_utterance[1:] + user_intent = user_intent[1:] + return user_intent - intent = {"name": user_utterance, "confidence": 1.0} + def _parse_raw_user_utterance(self, step: Dict[Text, Any]) -> Optional[UserUttered]: + intent_name = self._user_intent_from_step(step) + intent = {"name": intent_name, "confidence": 1.0} + + if KEY_USER_MESSAGE in step: + user_message = step[KEY_USER_MESSAGE].strip() + entities = entities_parser.find_entities_in_training_example(user_message) + plain_text = entities_parser.replace_entities(user_message) + else: + raw_entities = step.get(KEY_ENTITIES, []) + entities = self._parse_raw_entities(raw_entities) + plain_text = intent_name - return UserUttered(user_utterance, intent, final_entities) + return UserUttered(plain_text, intent, entities) @staticmethod def _parse_raw_entities( diff --git a/rasa/core/training/story_writer/yaml_story_writer.py b/rasa/core/training/story_writer/yaml_story_writer.py index 4d7740b399b5..bba815b83127 100644 --- a/rasa/core/training/story_writer/yaml_story_writer.py +++ b/rasa/core/training/story_writer/yaml_story_writer.py @@ -1,11 +1,16 @@ from collections import OrderedDict from pathlib import Path -import ruamel.yaml as ruamel_yaml -from typing import List, Text, Union, Optional +from ruamel import yaml +from typing import Any, Dict, List, Text, Union, Optional + +from ruamel.yaml.comments import CommentedMap +from ruamel.yaml.scalarstring import ( + DoubleQuotedScalarString, + LiteralScalarString, +) from rasa.utils.common import raise_warning -from ruamel.yaml.scalarstring import DoubleQuotedScalarString from rasa.constants import LATEST_TRAINING_DATA_FORMAT_VERSION, DOCS_URL_STORIES from rasa.core.events import UserUttered, ActionExecuted, SlotSet, ActiveLoop @@ -20,6 +25,7 @@ KEY_SLOT_NAME, KEY_CHECKPOINT_SLOTS, KEY_OR, + KEY_USER_MESSAGE, ) from rasa.core.training.structures import StoryStep, Checkpoint @@ -34,18 +40,15 @@ def dumps(self, story_steps: List[StoryStep]) -> Text: Args: story_steps: Original story steps to be converted to the YAML. - Returns: String with story steps in the YAML format. """ - stream = ruamel_yaml.StringIO() + stream = yaml.StringIO() self.dump(stream, story_steps) return stream.getvalue() def dump( - self, - target: Union[Text, Path, ruamel_yaml.StringIO], - story_steps: List[StoryStep], + self, target: Union[Text, Path, yaml.StringIO], story_steps: List[StoryStep], ) -> None: """Writes Story steps into a target file/stream. @@ -53,9 +56,17 @@ def dump( target: name of the target file/stream to write the YAML to. story_steps: Original story steps to be converted to the YAML. 
""" - from rasa.validator import KEY_TRAINING_DATA_FORMAT_VERSION + result = self.stories_to_yaml(story_steps) + + io_utils.write_yaml(result, target, True) - self.target = target + def stories_to_yaml(self, story_steps: List[StoryStep]) -> Dict[Text, Any]: + """Converts a sequence of story steps into yaml format. + + Args: + story_steps: Original story steps to be converted to the YAML. + """ + from rasa.validator import KEY_TRAINING_DATA_FORMAT_VERSION stories = [] for story_step in story_steps: @@ -67,9 +78,9 @@ def dump( result[KEY_TRAINING_DATA_FORMAT_VERSION] = DoubleQuotedScalarString( LATEST_TRAINING_DATA_FORMAT_VERSION ) - result[KEY_STORIES] = stories - io_utils.write_yaml(result, self.target, True) + result[KEY_STORIES] = stories + return result def process_story_step(self, story_step: StoryStep) -> Optional[OrderedDict]: """Converts a single story step into an ordered dict. @@ -82,7 +93,7 @@ def process_story_step(self, story_step: StoryStep) -> Optional[OrderedDict]: """ if self.story_contains_forms(story_step): raise_warning( - f'File "{self.target}" contains a story "{story_step.block_name}" ' + f'Training data file contains a story "{story_step.block_name}" ' f"that has form(s) in it. This story cannot be converted automatically " f"because of the new Rules system in Rasa Open Source " f"version {LATEST_TRAINING_DATA_FORMAT_VERSION}. " @@ -128,6 +139,13 @@ def story_contains_forms(story_step) -> bool: [event for event in story_step.events if isinstance(event, ActiveLoop)] ) + @staticmethod + def _text_is_real_message(user_utterance: UserUttered) -> bool: + return ( + not user_utterance.intent + or user_utterance.text != user_utterance.as_story_string() + ) + @staticmethod def process_user_utterance(user_utterance: UserUttered) -> OrderedDict: """Converts a single user utterance into an ordered dict. @@ -138,9 +156,17 @@ def process_user_utterance(user_utterance: UserUttered) -> OrderedDict: Returns: Dict with a user utterance. """ - result = OrderedDict() + result = CommentedMap() result[KEY_USER_INTENT] = user_utterance.intent["name"] + if hasattr(user_utterance, "inline_comment"): + result.yaml_add_eol_comment( + user_utterance.inline_comment(), KEY_USER_INTENT + ) + + if YAMLStoryWriter._text_is_real_message(user_utterance): + result[KEY_USER_MESSAGE] = LiteralScalarString(user_utterance.text) + if len(user_utterance.entities): entities = [] for entity in user_utterance.entities: @@ -162,9 +188,12 @@ def process_action(action: ActionExecuted) -> OrderedDict: Returns: Dict with an action. 
""" - result = OrderedDict() + result = CommentedMap() result[KEY_ACTION] = action.action_name + if hasattr(action, "inline_comment"): + result.yaml_add_eol_comment(action.inline_comment(), KEY_ACTION) + return result @staticmethod diff --git a/rasa/core/training/structures.py b/rasa/core/training/structures.py index e0af764aa4d3..0088b0683b4a 100644 --- a/rasa/core/training/structures.py +++ b/rasa/core/training/structures.py @@ -240,7 +240,7 @@ def __init__( def from_events(events: List[Event], story_name: Optional[Text] = None) -> "Story": """Create a story from a list of events.""" - story_step = StoryStep() + story_step = StoryStep(story_name) for event in events: story_step.add_event(event) return Story([story_step], story_name) diff --git a/rasa/data.py b/rasa/data.py index e5bdd0ca62a6..e76e17c2eb2b 100644 --- a/rasa/data.py +++ b/rasa/data.py @@ -4,9 +4,8 @@ import tempfile import uuid from pathlib import Path -from typing import Tuple, List, Text, Set, Union, Optional, Iterable +from typing import Callable, Tuple, List, Text, Set, Union, Optional, Iterable -from rasa.constants import DEFAULT_E2E_TESTS_PATH from rasa.nlu.training_data import loading as nlu_loading logger = logging.getLogger(__name__) @@ -22,6 +21,56 @@ ) +def is_likely_yaml_file(file_path: Text) -> bool: + """Check if a file likely contains yaml. + + Arguments: + file_path: path to the file + + Returns: + `True` if the file likely contains data in yaml format, `False` otherwise. + """ + return Path(file_path).suffix in YAML_FILE_EXTENSIONS + + +def is_likely_json_file(file_path: Text) -> bool: + """Check if a file likely contains json. + + Arguments: + file_path: path to the file + + Returns: + `True` if the file likely contains data in json format, `False` otherwise. + """ + return Path(file_path).suffix in JSON_FILE_EXTENSIONS + + +def is_likely_markdown_file(file_path: Text) -> bool: + """Check if a file likely contains markdown. + + Arguments: + file_path: path to the file + + Returns: + `True` if the file likely contains data in markdown format, + `False` otherwise. + """ + return Path(file_path).suffix in MARKDOWN_FILE_EXTENSIONS + + +def get_test_directory(paths: Optional[Union[Text, List[Text]]],) -> Text: + """Recursively collects all Core training files from a list of paths. + + Args: + paths: List of paths to training files or folders containing them. + + Returns: + Path to temporary directory containing all found Core training files. + """ + test_files = get_data_files(paths, is_test_stories_file) + return _copy_files_to_new_dir(test_files) + + def get_core_directory(paths: Optional[Union[Text, List[Text]]],) -> Text: """Recursively collects all Core training files from a list of paths. @@ -31,7 +80,7 @@ def get_core_directory(paths: Optional[Union[Text, List[Text]]],) -> Text: Returns: Path to temporary directory containing all found Core training files. """ - core_files, _ = get_core_nlu_files(paths) + core_files = get_data_files(paths, is_story_file) return _copy_files_to_new_dir(core_files) @@ -44,7 +93,7 @@ def get_nlu_directory(paths: Optional[Union[Text, List[Text]]],) -> Text: Returns: Path to temporary directory containing all found NLU training files. """ - _, nlu_files = get_core_nlu_files(paths) + nlu_files = get_data_files(paths, is_nlu_file) return _copy_files_to_new_dir(nlu_files) @@ -61,7 +110,8 @@ def get_core_nlu_directories( containing the NLU training files. 
""" - story_files, nlu_data_files = get_core_nlu_files(paths) + story_files = get_data_files(paths, is_story_file) + nlu_data_files = get_data_files(paths, is_nlu_file) story_directory = _copy_files_to_new_dir(story_files) nlu_directory = _copy_files_to_new_dir(nlu_data_files) @@ -69,20 +119,20 @@ def get_core_nlu_directories( return story_directory, nlu_directory -def get_core_nlu_files( - paths: Optional[Union[Text, List[Text]]] -) -> Tuple[List[Text], List[Text]]: +def get_data_files( + paths: Optional[Union[Text, List[Text]]], filter_predicate: Callable[[Text], bool] +) -> List[Text]: """Recursively collects all training files from a list of paths. Args: paths: List of paths to training files or folders containing them. + filter_predicate: property to use when filtering the paths, e.g. `is_nlu_file`. Returns: - Tuple of paths to story and NLU files. + paths of training data files. """ - story_files = set() - nlu_data_files = set() + data_files = set() if paths is None: paths = [] @@ -94,24 +144,19 @@ def get_core_nlu_files( continue if _is_valid_filetype(path): - if is_nlu_file(path): - nlu_data_files.add(os.path.abspath(path)) - elif is_story_file(path): - story_files.add(os.path.abspath(path)) + if filter_predicate(path): + data_files.add(os.path.abspath(path)) else: - new_story_files, new_nlu_data_files = _find_core_nlu_files_in_directory( - path - ) + new_data_files = _find_data_files_in_directory(path, filter_predicate) + data_files.update(new_data_files) - story_files.update(new_story_files) - nlu_data_files.update(new_nlu_data_files) + return sorted(data_files) - return sorted(story_files), sorted(nlu_data_files) - -def _find_core_nlu_files_in_directory(directory: Text,) -> Tuple[Set[Text], Set[Text]]: - story_files = set() - nlu_data_files = set() +def _find_data_files_in_directory( + directory: Text, filter_property: Callable[[Text], bool] +) -> Set[Text]: + filtered_files = set() for root, _, files in os.walk(directory, followlinks=True): # we sort the files here to ensure consistent order for repeatable training @@ -122,12 +167,10 @@ def _find_core_nlu_files_in_directory(directory: Text,) -> Tuple[Set[Text], Set[ if not _is_valid_filetype(full_path): continue - if is_nlu_file(full_path): - nlu_data_files.add(full_path) - elif is_story_file(full_path): - story_files.add(full_path) + if filter_property(full_path): + filtered_files.add(full_path) - return story_files, nlu_data_files + return filtered_files def _is_valid_filetype(path: Text) -> bool: @@ -156,37 +199,33 @@ def is_story_file(file_path: Text) -> bool: `True` if it's a story file, otherwise `False`. """ from rasa.core.training.story_reader.yaml_story_reader import YAMLStoryReader - - if YAMLStoryReader.is_yaml_story_file(file_path): - return True - from rasa.core.training.story_reader.markdown_story_reader import ( MarkdownStoryReader, ) - return MarkdownStoryReader.is_markdown_story_file(file_path) + return YAMLStoryReader.is_yaml_story_file( + file_path + ) or MarkdownStoryReader.is_markdown_story_file(file_path) -def is_end_to_end_conversation_test_file(file_path: Text) -> bool: - """Checks if a file is an end-to-end conversation test file. +def is_test_stories_file(file_path: Text) -> bool: + """Checks if a file is a test stories file. Args: file_path: Path of the file which should be checked. Returns: - `True` if it's a conversation test file, otherwise `False`. + `True` if it's a story file containing tests, otherwise `False`. 
""" - - if Path(file_path).suffix not in MARKDOWN_FILE_EXTENSIONS: - return False - - dirname = os.path.dirname(file_path) - return ( - DEFAULT_E2E_TESTS_PATH in dirname - and is_story_file(file_path) - and not is_nlu_file(file_path) + from rasa.core.training.story_reader.yaml_story_reader import YAMLStoryReader + from rasa.core.training.story_reader.markdown_story_reader import ( + MarkdownStoryReader, ) + return YAMLStoryReader.is_yaml_story_file( + file_path + ) or MarkdownStoryReader.is_markdown_test_stories_file(file_path) + def is_config_file(file_path: Text) -> bool: """Checks whether the given file path is a Rasa config file. diff --git a/rasa/importers/autoconfig.py b/rasa/importers/autoconfig.py index 91646833c41c..7033c14fc35c 100644 --- a/rasa/importers/autoconfig.py +++ b/rasa/importers/autoconfig.py @@ -126,6 +126,7 @@ def _dump_config( auto_configured_keys: Keys for which a commented out auto configuration section needs to be added to the config file. """ + config_as_expected = _is_config_file_as_expected( config_file_path, missing_keys, auto_configured_keys ) diff --git a/rasa/importers/importer.py b/rasa/importers/importer.py index 9724779457b9..83d7fd3be45b 100644 --- a/rasa/importers/importer.py +++ b/rasa/importers/importer.py @@ -4,7 +4,6 @@ import logging from rasa.core.domain import Domain -from rasa.core.interpreter import RegexInterpreter, NaturalLanguageInterpreter from rasa.core.training.structures import StoryGraph from rasa.nlu.training_data import TrainingData import rasa.utils.io as io_utils @@ -20,13 +19,12 @@ async def get_domain(self) -> Domain: """Retrieves the domain of the bot. Returns: - Loaded ``Domain``. + Loaded `Domain`. """ raise NotImplementedError() async def get_stories( self, - interpreter: "NaturalLanguageInterpreter" = RegexInterpreter(), template_variables: Optional[Dict] = None, use_e2e: bool = False, exclusion_percentage: Optional[int] = None, @@ -34,15 +32,13 @@ async def get_stories( """Retrieves the stories that should be used for training. Args: - interpreter: Interpreter that should be used to parse end to - end learning annotations. template_variables: Values of templates that should be replaced while reading the story files. use_e2e: Specifies whether to parse end to end learning annotations. exclusion_percentage: Amount of training data that should be excluded. Returns: - ``StoryGraph`` containing all loaded stories. + `StoryGraph` containing all loaded stories. """ raise NotImplementedError() @@ -63,7 +59,7 @@ async def get_nlu_data(self, language: Optional[Text] = "en") -> TrainingData: language: Can be used to only load training data for a certain language. Returns: - Loaded NLU ``TrainingData``. + Loaded NLU `TrainingData`. """ raise NotImplementedError() @@ -74,7 +70,7 @@ def load_from_config( domain_path: Optional[Text] = None, training_data_paths: Optional[List[Text]] = None, ) -> "TrainingDataImporter": - """Loads a ``TrainingDataImporter`` instance from a configuration file.""" + """Loads a `TrainingDataImporter` instance from a configuration file.""" config = io_utils.read_config_file(config_path) return TrainingDataImporter.load_from_dict( @@ -87,8 +83,9 @@ def load_core_importer_from_config( domain_path: Optional[Text] = None, training_data_paths: Optional[List[Text]] = None, ) -> "TrainingDataImporter": - """Loads a ``TrainingDataImporter`` instance from a configuration file that - only reads Core training data. + """Loads core `TrainingDataImporter` instance. 
+
+        The instance loaded from the configuration file will only read Core training data.
        """
 
        importer = TrainingDataImporter.load_from_config(
@@ -103,8 +100,9 @@ def load_nlu_importer_from_config(
        domain_path: Optional[Text] = None,
        training_data_paths: Optional[List[Text]] = None,
    ) -> "TrainingDataImporter":
-        """Loads a ``TrainingDataImporter`` instance from a configuration file that
-        only reads NLU training data.
+        """Loads an NLU `TrainingDataImporter` instance.
+
+        The instance loaded from the configuration file will only read NLU training data.
        """
 
        importer = TrainingDataImporter.load_from_config(
@@ -120,7 +118,7 @@ def load_from_dict(
        domain_path: Optional[Text] = None,
        training_data_paths: Optional[List[Text]] = None,
    ) -> "TrainingDataImporter":
-        """Loads a ``TrainingDataImporter`` instance from a dictionary."""
+        """Loads a `TrainingDataImporter` instance from a dictionary."""
 
        from rasa.importers.rasa import RasaFileImporter
 
@@ -182,7 +180,6 @@ async def get_domain(self) -> Domain:
    async def get_stories(
        self,
-        interpreter: "NaturalLanguageInterpreter" = RegexInterpreter(),
        template_variables: Optional[Dict] = None,
        use_e2e: bool = False,
        exclusion_percentage: Optional[int] = None,
@@ -207,13 +204,12 @@ async def get_domain(self) -> Domain:
 
    async def get_stories(
        self,
-        interpreter: "NaturalLanguageInterpreter" = RegexInterpreter(),
        template_variables: Optional[Dict] = None,
        use_e2e: bool = False,
        exclusion_percentage: Optional[int] = None,
    ) -> StoryGraph:
        return await self._importer.get_stories(
-            interpreter, template_variables, use_e2e, exclusion_percentage
+            template_variables, use_e2e, exclusion_percentage
        )
 
    async def get_config(self) -> Dict:
@@ -224,8 +220,10 @@ async def get_nlu_data(self, language: Optional[Text] = "en") -> TrainingData:
 
 class CombinedDataImporter(TrainingDataImporter):
-    """A ``TrainingDataImporter`` that supports using multiple ``TrainingDataImporter``s as
-    if they were a single instance.
+    """A `TrainingDataImporter` that combines multiple importers.
+
+    Uses multiple `TrainingDataImporter` instances
+    to load the data as if they were a single instance. 
""" def __init__(self, importers: List[TrainingDataImporter]): @@ -247,15 +245,12 @@ async def get_domain(self) -> Domain: async def get_stories( self, - interpreter: "NaturalLanguageInterpreter" = RegexInterpreter(), template_variables: Optional[Dict] = None, use_e2e: bool = False, exclusion_percentage: Optional[int] = None, ) -> StoryGraph: stories = [ - importer.get_stories( - interpreter, template_variables, use_e2e, exclusion_percentage - ) + importer.get_stories(template_variables, use_e2e, exclusion_percentage) for importer in self._importers ] stories = await asyncio.gather(*stories) diff --git a/rasa/importers/multi_project.py b/rasa/importers/multi_project.py index 17dff5af01af..ec1d2c1eebd4 100644 --- a/rasa/importers/multi_project.py +++ b/rasa/importers/multi_project.py @@ -6,7 +6,6 @@ from rasa import data import rasa.utils.io as io_utils from rasa.core.domain import Domain -from rasa.core.interpreter import RegexInterpreter, NaturalLanguageInterpreter from rasa.importers.importer import TrainingDataImporter from rasa.importers import utils from rasa.nlu.training_data import TrainingData @@ -38,9 +37,8 @@ def __init__( self._init_from_dict(self.config, self._project_directory) - extra_story_files, extra_nlu_files = data.get_core_nlu_files( - training_data_paths - ) + extra_nlu_files = data.get_data_files(training_data_paths, data.is_nlu_file) + extra_story_files = data.get_data_files(training_data_paths, data.is_story_file) self._story_paths += extra_story_files self._nlu_paths += extra_nlu_files @@ -95,7 +93,7 @@ def _init_from_directory(self, path: Text): # Check next file continue - if data.is_end_to_end_conversation_test_file(full_path): + if data.is_test_stories_file(full_path): self._e2e_story_paths.append(full_path) elif Domain.is_domain_file(full_path): self._domain_paths.append(full_path) @@ -173,7 +171,6 @@ async def get_domain(self) -> Domain: async def get_stories( self, - interpreter: "NaturalLanguageInterpreter" = RegexInterpreter(), template_variables: Optional[Dict] = None, use_e2e: bool = False, exclusion_percentage: Optional[int] = None, @@ -183,7 +180,6 @@ async def get_stories( return await utils.story_graph_from_paths( story_paths, await self.get_domain(), - interpreter, template_variables, use_e2e, exclusion_percentage, diff --git a/rasa/importers/rasa.py b/rasa/importers/rasa.py index 0a594843b0f0..17a524c09e75 100644 --- a/rasa/importers/rasa.py +++ b/rasa/importers/rasa.py @@ -3,7 +3,6 @@ from rasa import data from rasa.core.domain import Domain, InvalidDomain -from rasa.core.interpreter import NaturalLanguageInterpreter, RegexInterpreter from rasa.core.training.structures import StoryGraph from rasa.importers import utils, autoconfig from rasa.importers.importer import TrainingDataImporter @@ -25,9 +24,8 @@ def __init__( self._domain_path = domain_path - self._story_files, self._nlu_files = data.get_core_nlu_files( - training_data_paths - ) + self._nlu_files = data.get_data_files(training_data_paths, data.is_nlu_file) + self._story_files = data.get_data_files(training_data_paths, data.is_story_file) self.config = autoconfig.get_configuration(config_file) @@ -36,7 +34,6 @@ async def get_config(self) -> Dict: async def get_stories( self, - interpreter: "NaturalLanguageInterpreter" = RegexInterpreter(), template_variables: Optional[Dict] = None, use_e2e: bool = False, exclusion_percentage: Optional[int] = None, @@ -45,7 +42,6 @@ async def get_stories( return await utils.story_graph_from_paths( self._story_files, await self.get_domain(), - interpreter, 
template_variables, use_e2e, exclusion_percentage, diff --git a/rasa/importers/utils.py b/rasa/importers/utils.py index 3e4a603cc61c..a39e3c3afc42 100644 --- a/rasa/importers/utils.py +++ b/rasa/importers/utils.py @@ -1,7 +1,6 @@ from typing import Iterable, Text, Optional, Dict, List from rasa.core.domain import Domain -from rasa.core.interpreter import NaturalLanguageInterpreter, RegexInterpreter from rasa.core.training.structures import StoryGraph from rasa.nlu.training_data import TrainingData @@ -16,7 +15,6 @@ def training_data_from_paths(paths: Iterable[Text], language: Text) -> TrainingD async def story_graph_from_paths( files: List[Text], domain: Domain, - interpreter: NaturalLanguageInterpreter = RegexInterpreter(), template_variables: Optional[Dict] = None, use_e2e: bool = False, exclusion_percentage: Optional[int] = None, @@ -25,6 +23,6 @@ async def story_graph_from_paths( from rasa.core.training import loading story_steps = await loading.load_data_from_files( - files, domain, interpreter, template_variables, use_e2e, exclusion_percentage + files, domain, template_variables, use_e2e, exclusion_percentage ) return StoryGraph(story_steps) diff --git a/rasa/nlu/test.py b/rasa/nlu/test.py index ca269a4b49c6..639146313dca 100644 --- a/rasa/nlu/test.py +++ b/rasa/nlu/test.py @@ -353,7 +353,7 @@ def plot_entity_confidences( for target, prediction, confidence in zip( merged_targets, merged_predictions, merged_confidences ) - if prediction != NO_ENTITY and target != prediction + if prediction not in (NO_ENTITY, target) ] plot_utils.plot_histogram([pos_hist, neg_hist], title, hist_filename) @@ -608,17 +608,13 @@ def evaluate_intents( if isinstance(report, str): log_evaluation_table(report, precision, f1, accuracy) - if successes: - successes_filename = "intent_successes.json" - if output_directory: - successes_filename = os.path.join(output_directory, successes_filename) + if successes and output_directory: + successes_filename = os.path.join(output_directory, "intent_successes.json") # save classified samples to file for debugging write_intent_successes(intent_results, successes_filename) - if errors: - errors_filename = "intent_errors.json" - if output_directory: - errors_filename = os.path.join(output_directory, errors_filename) + if errors and output_directory: + errors_filename = os.path.join(output_directory, "intent_errors.json") # log and save misclassified samples to file for debugging write_intent_errors(intent_results, errors_filename) @@ -1505,7 +1501,7 @@ def run_evaluation( disable_plotting, ) - if entity_results: + if any(entity_results): logger.info("Entity evaluation results:") extractors = get_entity_extractors(interpreter) result["entity_evaluation"] = evaluate_entities( @@ -1618,6 +1614,7 @@ def _contains_entity_labels(entity_results: List[EntityEvaluationResult]) -> boo for result in entity_results: if result.entity_targets or result.entity_predictions: return True + return False def cross_validate( @@ -1645,7 +1642,6 @@ def cross_validate( corresponds to the relevant result for one fold """ import rasa.nlu.config - from collections import defaultdict if isinstance(nlu_config, str): nlu_config = rasa.nlu.config.load(nlu_config) @@ -1883,7 +1879,9 @@ def compare_nlu( model_output_path, fixed_model_name=model_name, ) - except Exception as e: + except Exception as e: # skipcq: PYL-W0703 + # general exception catching needed to continue evaluating other + # model configurations logger.warning(f"Training model '{model_name}' failed. 
Error: {e}") f_score_results[model_name][run].append(0.0) continue diff --git a/rasa/nlu/training_data/entities_parser.py b/rasa/nlu/training_data/entities_parser.py index d5293896f3f1..02b3422a70d4 100644 --- a/rasa/nlu/training_data/entities_parser.py +++ b/rasa/nlu/training_data/entities_parser.py @@ -9,6 +9,7 @@ ENTITY_ATTRIBUTE_ROLE, ENTITY_ATTRIBUTE_VALUE, ) +from rasa.nlu.training_data.message import Message from rasa.utils.common import raise_warning GROUP_ENTITY_VALUE = "value" @@ -165,3 +166,12 @@ def replace_entities(training_example: Text) -> Text: return re.sub( ENTITY_REGEX, lambda m: m.groupdict()[GROUP_ENTITY_TEXT], training_example ) + + +def parse_training_example(example: Text, intent: Optional[Text] = None) -> "Message": + """Extract entities and synonyms, and convert to plain text.""" + + entities = find_entities_in_training_example(example) + plain_text = replace_entities(example) + + return Message.build(plain_text, intent, entities) diff --git a/rasa/nlu/training_data/formats/markdown.py b/rasa/nlu/training_data/formats/markdown.py index 706d22555732..81ac53dfd3a5 100644 --- a/rasa/nlu/training_data/formats/markdown.py +++ b/rasa/nlu/training_data/formats/markdown.py @@ -101,12 +101,18 @@ def _parse_item(self, line: Text) -> None: """Parses an md list item line based on the current section type.""" import rasa.nlu.training_data.lookup_tables_parser as lookup_tables_parser import rasa.nlu.training_data.synonyms_parser as synonyms_parser + from rasa.nlu.training_data import entities_parser match = re.match(item_regex, line) if match: item = match.group(1) if self.current_section == INTENT: - parsed = self.parse_training_example(item) + parsed = entities_parser.parse_training_example( + item, self.current_title + ) + synonyms_parser.add_synonyms_from_entities( + parsed.text, parsed.get("entities", []), self.entity_synonyms + ) self.training_examples.append(parsed) elif self.current_section == SYNONYM: synonyms_parser.add_synonym( @@ -158,24 +164,6 @@ def _get_validated_dict(json_str: Text) -> Dict[Text, Text]: return data - def parse_training_example(self, example: Text) -> "Message": - """Extract entities and synonyms, and convert to plain text.""" - from rasa.nlu.training_data import Message - import rasa.nlu.training_data.entities_parser as entities_parser - import rasa.nlu.training_data.synonyms_parser as synonyms_parser - - entities = entities_parser.find_entities_in_training_example(example) - plain_text = entities_parser.replace_entities(example) - synonyms_parser.add_synonyms_from_entities( - plain_text, entities, self.entity_synonyms - ) - - message = Message.build(plain_text, self.current_title) - - if len(entities) > 0: - message.set("entities", entities) - return message - def _set_current_section(self, section: Text, title: Text) -> None: """Update parsing mode.""" if section not in AVAILABLE_SECTIONS: diff --git a/rasa/nlu/training_data/formats/rasa_yaml.py b/rasa/nlu/training_data/formats/rasa_yaml.py index 1a4544d64130..d78a4104d2fb 100644 --- a/rasa/nlu/training_data/formats/rasa_yaml.py +++ b/rasa/nlu/training_data/formats/rasa_yaml.py @@ -13,6 +13,7 @@ Optional, ) +from rasa import data from rasa.utils import validation from ruamel.yaml import YAMLError, StringIO @@ -21,7 +22,6 @@ DOCS_URL_TRAINING_DATA_NLU, LATEST_TRAINING_DATA_FORMAT_VERSION, ) -from rasa.data import YAML_FILE_EXTENSIONS from rasa.nlu.training_data.formats.readerwriter import ( TrainingDataReader, TrainingDataWriter, @@ -145,13 +145,13 @@ def _parse_nlu(self, nlu_data: 
Optional[List[Dict[Text, Any]]]) -> None: docs=DOCS_URL_TRAINING_DATA_NLU, ) - def _parse_intent(self, data: Dict[Text, Any]) -> None: + def _parse_intent(self, intent_data: Dict[Text, Any]) -> None: from rasa.nlu.training_data import Message import rasa.nlu.training_data.entities_parser as entities_parser import rasa.nlu.training_data.synonyms_parser as synonyms_parser import rasa.nlu.constants as nlu_constants - intent = data.get(KEY_INTENT, "") + intent = intent_data.get(KEY_INTENT, "") if not intent: raise_warning( f"Issue found while processing '{self.filename}': " @@ -162,7 +162,7 @@ def _parse_intent(self, data: Dict[Text, Any]) -> None: ) return - examples = data.get(KEY_INTENT_EXAMPLES, "") + examples = intent_data.get(KEY_INTENT_EXAMPLES, "") for example, entities in self._parse_training_examples(examples, intent): plain_text = entities_parser.replace_entities(example) @@ -349,7 +349,7 @@ def is_yaml_nlu_file(filename: Text) -> bool: `True` if the `filename` is possibly a valid YAML NLU file, `False` otherwise. """ - if Path(filename).suffix not in YAML_FILE_EXTENSIONS: + if not data.is_likely_yaml_file(filename): return False try: diff --git a/rasa/nlu/training_data/formats/readerwriter.py b/rasa/nlu/training_data/formats/readerwriter.py index d8763bfc676c..1816b7e24bb7 100644 --- a/rasa/nlu/training_data/formats/readerwriter.py +++ b/rasa/nlu/training_data/formats/readerwriter.py @@ -117,20 +117,20 @@ def generate_entity(text: Text, entity: Dict[Text, Any]) -> Text: if use_short_syntax: return f"[{entity_text}]({entity_type})" - - entity_dict = OrderedDict( - [ - (ENTITY_ATTRIBUTE_TYPE, entity_type), - (ENTITY_ATTRIBUTE_ROLE, entity_role), - (ENTITY_ATTRIBUTE_GROUP, entity_group), - (ENTITY_ATTRIBUTE_VALUE, entity_value), - ] - ) - entity_dict = OrderedDict( - [(k, v) for k, v in entity_dict.items() if v is not None] - ) - - return f"[{entity_text}]{json.dumps(entity_dict)}" + else: + entity_dict = OrderedDict( + [ + (ENTITY_ATTRIBUTE_TYPE, entity_type), + (ENTITY_ATTRIBUTE_ROLE, entity_role), + (ENTITY_ATTRIBUTE_GROUP, entity_group), + (ENTITY_ATTRIBUTE_VALUE, entity_value), + ] + ) + entity_dict = OrderedDict( + [(k, v) for k, v in entity_dict.items() if v is not None] + ) + + return f"[{entity_text}]{json.dumps(entity_dict)}" class JsonTrainingDataReader(TrainingDataReader): diff --git a/rasa/nlu/training_data/synonyms_parser.py b/rasa/nlu/training_data/synonyms_parser.py index 5d8aa1459c48..89744419c29a 100644 --- a/rasa/nlu/training_data/synonyms_parser.py +++ b/rasa/nlu/training_data/synonyms_parser.py @@ -1,4 +1,4 @@ -from typing import Text, List, Dict +from typing import Any, Text, List, Dict from rasa.nlu.constants import ( ENTITY_ATTRIBUTE_VALUE, @@ -8,7 +8,7 @@ def add_synonyms_from_entities( - plain_text: Text, entities: List[Dict], existing_synonyms: Dict + plain_text: Text, entities: List[Dict], existing_synonyms: Dict[Text, Any] ) -> None: """Adds synonyms found in intent examples. @@ -25,7 +25,7 @@ def add_synonyms_from_entities( def add_synonym( - synonym_value: Text, synonym_name: Text, existing_synonyms: Dict + synonym_value: Text, synonym_name: Text, existing_synonyms: Dict[Text, Any] ) -> None: """Adds a new synonym mapping to the provided list of synonyms. 
diff --git a/rasa/nlu/training_data/training_data.py b/rasa/nlu/training_data/training_data.py index fef7e7d6e17b..ee57f0ae8142 100644 --- a/rasa/nlu/training_data/training_data.py +++ b/rasa/nlu/training_data/training_data.py @@ -7,11 +7,7 @@ from os.path import relpath from typing import Any, Dict, List, Optional, Set, Text, Tuple, Callable -from rasa.data import ( - JSON_FILE_EXTENSIONS, - MARKDOWN_FILE_EXTENSIONS, - YAML_FILE_EXTENSIONS, -) +from rasa import data import rasa.nlu.utils from rasa.utils.common import raise_warning, lazy_property from rasa.nlu.constants import ( @@ -309,11 +305,11 @@ def nlu_as_yaml(self) -> Text: def persist_nlu(self, filename: Text = DEFAULT_TRAINING_DATA_OUTPUT_PATH) -> None: - if Path(filename).suffix in JSON_FILE_EXTENSIONS: + if data.is_likely_json_file(filename): rasa.nlu.utils.write_to_file(filename, self.nlu_as_json(indent=2)) - elif Path(filename).suffix in MARKDOWN_FILE_EXTENSIONS: + elif data.is_likely_markdown_file(filename): rasa.nlu.utils.write_to_file(filename, self.nlu_as_markdown()) - elif Path(filename).suffix in YAML_FILE_EXTENSIONS: + elif data.is_likely_yaml_file(filename): rasa.nlu.utils.write_to_file(filename, self.nlu_as_yaml()) else: ValueError( @@ -322,9 +318,9 @@ def persist_nlu(self, filename: Text = DEFAULT_TRAINING_DATA_OUTPUT_PATH) -> Non ) def persist_nlg(self, filename: Text) -> None: - if Path(filename).suffix in YAML_FILE_EXTENSIONS: + if data.is_likely_yaml_file(filename): rasa.nlu.utils.write_to_file(filename, self.nlg_as_yaml()) - elif Path(filename).suffix in MARKDOWN_FILE_EXTENSIONS: + elif data.is_likely_markdown_file(filename): nlg_serialized_data = self.nlg_as_markdown() if nlg_serialized_data: rasa.nlu.utils.write_to_file(filename, nlg_serialized_data) @@ -338,7 +334,7 @@ def persist_nlg(self, filename: Text) -> None: def get_nlg_persist_filename(nlu_filename: Text) -> Text: extension = Path(nlu_filename).suffix - if extension in JSON_FILE_EXTENSIONS: + if data.is_likely_json_file(nlu_filename): # backwards compatibility: previously NLG was always dumped as md. now # we are going to dump in the same format as the NLU data. 
unfortunately # there is a special case: NLU is in json format, in this case we use diff --git a/rasa/server.py b/rasa/server.py index 164c1a11dbcc..ffc4d9e7b0b1 100644 --- a/rasa/server.py +++ b/rasa/server.py @@ -3,7 +3,6 @@ import logging import multiprocessing import os -from pathlib import Path import tempfile import traceback import typing @@ -12,8 +11,7 @@ from pathlib import Path from typing import Any, Callable, List, Optional, Text, Union, Dict -from sanic.exceptions import InvalidUsage - +from rasa.core.training.story_writer.yaml_story_writer import YAMLStoryWriter from rasa.nlu.training_data.formats import RasaYAMLReader import rasa import rasa.core.utils @@ -29,7 +27,6 @@ MINIMUM_COMPATIBLE_VERSION, DOCS_URL_TRAINING_DATA_NLU, ) -from rasa.core import agent from rasa.core.agent import Agent from rasa.core.brokers.broker import EventBroker from rasa.core.channels.channel import ( @@ -343,7 +340,7 @@ async def _load_agent( if not lock_store: lock_store = LockStore.create(endpoints.lock_store) - loaded_agent = await agent.load_agent( + loaded_agent = await rasa.core.agent.load_agent( model_path, model_server, remote_storage, @@ -583,7 +580,7 @@ async def retrieve_story(request: Request, conversation_id: Text): tracker = tracker.travel_back_in_time(until_time) # dump and return tracker - state = tracker.export_stories(e2e=True) + state = YAMLStoryWriter().dumps(tracker.as_story().story_steps) return response.text(state) except Exception as e: logger.debug(traceback.format_exc()) @@ -830,7 +827,9 @@ async def evaluate_stories(request: Request) -> HTTPResponse: use_e2e = rasa.utils.endpoints.bool_arg(request, "e2e", default=False) try: - evaluation = await test(test_data, app.agent, e2e=use_e2e) + evaluation = await test( + test_data, app.agent, e2e=use_e2e, disable_plotting=True + ) return response.json(evaluation) except Exception as e: logger.error(traceback.format_exc()) @@ -872,7 +871,7 @@ async def evaluate_intents(request: Request) -> HTTPResponse: _, nlu_model = model.get_model_subdirectories(model_directory) try: - evaluation = run_evaluation(data_path, nlu_model) + evaluation = run_evaluation(data_path, nlu_model, disable_plotting=True) return response.json(evaluation) except Exception as e: logger.error(traceback.format_exc()) diff --git a/rasa/test.py b/rasa/test.py index 275f5646bdef..bdeee1ff796c 100644 --- a/rasa/test.py +++ b/rasa/test.py @@ -96,31 +96,26 @@ def test( model: Text, stories: Text, nlu_data: Text, - endpoints: Optional[Text] = None, output: Text = DEFAULT_RESULTS_PATH, additional_arguments: Optional[Dict] = None, ): if additional_arguments is None: additional_arguments = {} - test_core(model, stories, endpoints, output, additional_arguments) + test_core(model, stories, output, additional_arguments) test_nlu(model, nlu_data, output, additional_arguments) def test_core( model: Optional[Text] = None, stories: Optional[Text] = None, - endpoints: Optional[Text] = None, output: Text = DEFAULT_RESULTS_PATH, additional_arguments: Optional[Dict] = None, -): - import rasa.core.utils as core_utils +) -> None: import rasa.model - from rasa.core.interpreter import RegexInterpreter, NaturalLanguageInterpreter + from rasa.core.interpreter import RegexInterpreter from rasa.core.agent import Agent - _endpoints = core_utils.AvailableEndpoints.read_endpoints(endpoints) - if additional_arguments is None: additional_arguments = {} @@ -136,27 +131,22 @@ def test_core( ) return - core_path, nlu_path = rasa.model.get_model_subdirectories(unpacked_model) + _agent = 
Agent.load(unpacked_model)
 
-    if not core_path:
+    if _agent.policy_ensemble is None:
        cli_utils.print_error(
            "Unable to test: could not find a Core model. Use 'rasa train' to train a "
            "Rasa model and provide it via the '--model' argument."
        )
+        return
 
-    use_e2e = additional_arguments.get("e2e", False)
-
-    _interpreter = RegexInterpreter()
-    if nlu_path:
-        _interpreter = NaturalLanguageInterpreter.create(_endpoints.nlu or nlu_path)
-    elif use_e2e:
+    if isinstance(_agent.interpreter, RegexInterpreter):
        cli_utils.print_warning(
            "No NLU model found. Using default 'RegexInterpreter' for end-to-end "
-            "evaluation."
+            "evaluation. If you added actual user messages to your test stories "
+            "this will likely lead to the tests failing. In that case, you need "
+            "to train an NLU model first, e.g. using `rasa train`."
        )
 
-    _agent = Agent.load(unpacked_model, interpreter=_interpreter)
-
    from rasa.core.test import test
 
    kwargs = utils.minimal_kwargs(additional_arguments, test, ["stories", "agent"])
diff --git a/rasa/utils/io.py b/rasa/utils/io.py
index d1a7743d1d3d..b2fd249b6943 100644
--- a/rasa/utils/io.py
+++ b/rasa/utils/io.py
@@ -245,6 +245,8 @@ def convert_to_ordered_dict(obj: Any) -> Any:
        An `OrderedDict` with all nested dictionaries converted if `obj` is a
        dictionary, otherwise the object itself.
    """
+    if isinstance(obj, OrderedDict):
+        return obj
    # use recursion on lists
    if isinstance(obj, list):
        return [convert_to_ordered_dict(element) for element in obj]
diff --git a/rasa/utils/plotting.py b/rasa/utils/plotting.py
index 6035464e1c55..1201dfed314a 100644
--- a/rasa/utils/plotting.py
+++ b/rasa/utils/plotting.py
@@ -12,18 +12,34 @@
 logger = logging.getLogger(__name__)
 
-# At first, matplotlib will be initialized with default OS-specific available backend
-# if that didn't happen, we'll try to set it up manually
-if matplotlib.get_backend() is not None:
-    pass
-else:  # pragma: no cover
-    try:
-        # If the `tkinter` package is available, we can use the `TkAgg` backend
-        import tkinter
-
-        matplotlib.use("TkAgg")
-    except ImportError:
-        matplotlib.use("agg")
+def _fix_matplotlib_backend() -> None:
+    """Tries to fix a broken matplotlib backend."""
+    # At first, matplotlib will be initialized with default OS-specific
+    # available backend
+    if matplotlib.get_backend() == "TkAgg":
+        try:
+            # on OSX sometimes the tkinter package is broken and can't be imported. 
+ # we'll try to import it and if it fails we will use a different backend + import tkinter # skipcq: PYL-W0611 + except (ImportError, ModuleNotFoundError): + logger.debug("Setting matplotlib backend to 'agg'") + matplotlib.use("agg") + + # if no backend is set by default, we'll try to set it up manually + elif matplotlib.get_backend() is None: # pragma: no cover + try: + # If the `tkinter` package is available, we can use the `TkAgg` backend + import tkinter # skipcq: PYL-W0611 + + logger.debug("Setting matplotlib backend to 'TkAgg'") + matplotlib.use("TkAgg") + except (ImportError, ModuleNotFoundError): + logger.debug("Setting matplotlib backend to 'agg'") + matplotlib.use("agg") + + +# we call the fix as soon as this package gets imported +_fix_matplotlib_backend() def plot_confusion_matrix( @@ -52,7 +68,7 @@ def plot_confusion_matrix( import matplotlib.pyplot as plt from matplotlib.colors import LogNorm - zmax = confusion_matrix.max() + zmax = confusion_matrix.max() if len(confusion_matrix) > 0 else 1 plt.clf() if not color_map: color_map = plt.cm.Blues @@ -78,7 +94,7 @@ def plot_confusion_matrix( else: logger.info(f"Confusion matrix, without normalization: \n{confusion_matrix}") - thresh = confusion_matrix.max() / 2.0 + thresh = zmax / 2.0 for i, j in itertools.product( range(confusion_matrix.shape[0]), range(confusion_matrix.shape[1]) ): diff --git a/tests/cli/test_rasa_export.py b/tests/cli/test_rasa_export.py index 2a5e0380a919..c7a2fafef278 100644 --- a/tests/cli/test_rasa_export.py +++ b/tests/cli/test_rasa_export.py @@ -6,6 +6,7 @@ import pytest from _pytest.monkeypatch import MonkeyPatch from _pytest.pytester import RunResult +from ruamel.yaml.scalarstring import SingleQuotedScalarString import rasa.core.utils as rasa_core_utils from rasa.cli import export @@ -65,7 +66,14 @@ def test_validate_timestamp_options_with_invalid_timestamps(): def test_get_event_broker_and_tracker_store_from_endpoint_config(tmp_path: Path): # write valid config to file endpoints_path = write_endpoint_config_to_yaml( - tmp_path, {"event_broker": {"type": "sql"}, "tracker_store": {"type": "sql"}} + tmp_path, + { + "event_broker": { + "type": "sql", + "db": str(tmp_path / "rasa.db").replace("\\", "\\\\"), + }, + "tracker_store": {"type": "sql"}, + }, ) available_endpoints = rasa_core_utils.read_endpoints_from_path(endpoints_path) diff --git a/tests/cli/test_rasa_test.py b/tests/cli/test_rasa_test.py index 394475806f7e..5884d6a94246 100644 --- a/tests/cli/test_rasa_test.py +++ b/tests/cli/test_rasa_test.py @@ -21,6 +21,14 @@ def test_test_core_no_plot(run_in_simple_project: Callable[..., RunResult]): def test_test(run_in_simple_project_with_model: Callable[..., RunResult]): + write_yaml( + { + "pipeline": "KeywordIntentClassifier", + "policies": [{"name": "MemoizationPolicy"}], + }, + "config2.yml", + ) + run_in_simple_project_with_model("test") assert os.path.exists("results") @@ -61,14 +69,15 @@ def test_test_nlu_cross_validation(run_in_simple_project: Callable[..., RunResul def test_test_nlu_comparison(run_in_simple_project: Callable[..., RunResult]): - copyfile("config.yml", "config-1.yml") + write_yaml({"pipeline": "KeywordIntentClassifier"}, "config.yml") + write_yaml({"pipeline": "KeywordIntentClassifier"}, "config2.yml") run_in_simple_project( "test", "nlu", "--config", "config.yml", - "config-1.yml", + "config2.yml", "--run", "2", "--percentages", @@ -123,8 +132,6 @@ def test_test_core_comparison_after_train( "--percentages", "25", "75", - "--augmentation", - "5", "--out", "comparison_models", ) 
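
Taken together, the data-collection changes in this patch replace the paired `get_core_nlu_files` helper, which returned a `(story_files, nlu_files)` tuple, with the predicate-driven `get_data_files`, and test stories are now identified by the `TEST_STORIES_FILE_PREFIX` file-name prefix rather than by their location under the old default e2e tests path. A minimal usage sketch of how the refactored helpers compose, assuming a build of this branch and an illustrative project layout with `data/` and `tests/` directories (neither path is mandated by the patch):

```python
from rasa import data

# Story and NLU files are collected with the same generic helper, each call
# parameterized by a predicate instead of unpacking a (core, nlu) tuple.
story_files = data.get_data_files(["data/"], data.is_story_file)
nlu_files = data.get_data_files(["data/"], data.is_nlu_file)

# Test stories are story files whose file name carries the test prefix;
# both the YAML and the Markdown variants are matched.
test_story_files = data.get_data_files(["tests/"], data.is_test_stories_file)
```

Passing the predicate into the shared directory walk is also what lets `get_test_directory` reuse the same machinery for the new YAML test stories.
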
diff --git a/tests/cli/test_utils.py b/tests/cli/test_utils.py index 6257da93f10f..239c27e6b1fa 100644 --- a/tests/cli/test_utils.py +++ b/tests/cli/test_utils.py @@ -72,30 +72,29 @@ def test_validate_invalid_path(): get_validated_path("test test test", "out", "default") -def test_validate_valid_path(): - tempdir = tempfile.mkdtemp() - - assert get_validated_path(tempdir, "out", "default") == tempdir +def test_validate_valid_path(tmp_path: pathlib.Path): + assert get_validated_path(str(tmp_path), "out", "default") == str(tmp_path) def test_validate_if_none_is_valid(): assert get_validated_path(None, "out", "default", True) is None -def test_validate_with_none_if_default_is_valid(caplog: LogCaptureFixture): - tempdir = tempfile.mkdtemp() - +def test_validate_with_none_if_default_is_valid( + caplog: LogCaptureFixture, tmp_path: pathlib.Path +): with caplog.at_level(logging.WARNING, rasa.cli.utils.logger.name): - assert get_validated_path(None, "out", tempdir) == tempdir + assert get_validated_path(None, "out", str(tmp_path)) == str(tmp_path) assert caplog.records == [] -def test_validate_with_invalid_directory_if_default_is_valid(): - tempdir = tempfile.mkdtemp() +def test_validate_with_invalid_directory_if_default_is_valid(tmp_path: pathlib.Path): invalid_directory = "gcfhvjkb" with pytest.warns(UserWarning) as record: - assert get_validated_path(invalid_directory, "out", tempdir) == tempdir + assert get_validated_path(invalid_directory, "out", str(tmp_path)) == str( + tmp_path + ) assert len(record) == 1 assert "does not seem to exist" in record[0].message.args[0] diff --git a/tests/core/conftest.py b/tests/core/conftest.py index b3e08caaa91f..dba0e8f308e0 100644 --- a/tests/core/conftest.py +++ b/tests/core/conftest.py @@ -226,7 +226,7 @@ def project() -> Text: @pytest.fixture -async def form_bot_agent(trained_async, tmpdir_factory) -> Agent: +async def form_bot_agent(trained_async) -> Agent: zipped_model = await trained_async( domain="examples/formbot/domain.yml", config="examples/formbot/config.yml", diff --git a/tests/core/test_broker.py b/tests/core/test_broker.py index a3008c3803b1..bc0ef14ae537 100644 --- a/tests/core/test_broker.py +++ b/tests/core/test_broker.py @@ -1,6 +1,7 @@ import json import logging from pathlib import Path +import textwrap from typing import Union, Text, List, Optional, Type @@ -9,6 +10,7 @@ from _pytest.monkeypatch import MonkeyPatch +import rasa.utils.io from rasa.core.brokers.broker import EventBroker from rasa.core.brokers.file import FileEventBroker from rasa.core.brokers.kafka import KafkaEventBroker @@ -117,14 +119,23 @@ def test_sql_broker_logs_to_sql_db(): assert events_types == ["user", "slot", "restart"] -def test_file_broker_from_config(): - cfg = read_endpoint_config( - "data/test_endpoints/event_brokers/file_endpoint.yml", "event_broker" +def test_file_broker_from_config(tmp_path: Path): + # backslashes need to be encoded (windows...) 
otherwise we run into unicode issues + path = str(tmp_path / "rasa_test_event.log").replace("\\", "\\\\") + endpoint_config = textwrap.dedent( + f""" + event_broker: + path: "{path}" + type: "file" + """ ) + rasa.utils.io.write_text_file(endpoint_config, tmp_path / "endpoint.yml") + + cfg = read_endpoint_config(str(tmp_path / "endpoint.yml"), "event_broker") actual = EventBroker.create(cfg) assert isinstance(actual, FileEventBroker) - assert actual.path == "rasa_event.log" + assert actual.path.endswith("rasa_test_event.log") def test_file_broker_logs_to_file(tmp_path: Path): @@ -166,8 +177,13 @@ def test_file_broker_properly_logs_newlines(tmp_path): assert recovered == [event_with_newline] -def test_load_custom_broker_name(): - config = EndpointConfig(**{"type": "rasa.core.brokers.file.FileEventBroker"}) +def test_load_custom_broker_name(tmp_path: Path): + config = EndpointConfig( + **{ + "type": "rasa.core.brokers.file.FileEventBroker", + "path": str(tmp_path / "rasa_event.log"), + } + ) assert EventBroker.create(config) @@ -209,12 +225,15 @@ def test_no_pika_logs_if_no_debug_mode(caplog: LogCaptureFixture): def test_pika_logs_in_debug_mode(caplog: LogCaptureFixture, monkeypatch: MonkeyPatch): - from rasa.core.brokers import pika + from rasa.core.brokers.pika import _pika_log_level - with caplog.at_level(logging.DEBUG): - with pytest.raises(Exception): - pika.initialise_pika_connection( - "localhost", "user", "password", connection_attempts=1 - ) + pika_level = logging.getLogger("pika").level - assert len(caplog.records) > 0 + with caplog.at_level(logging.INFO): + with _pika_log_level(logging.CRITICAL): + assert logging.getLogger("pika").level == logging.CRITICAL + + with caplog.at_level(logging.DEBUG): + with _pika_log_level(logging.CRITICAL): + # level should not change + assert logging.getLogger("pika").level == pika_level diff --git a/tests/core/test_data.py b/tests/core/test_data.py index 7c1b2405c162..783b3c110b0b 100644 --- a/tests/core/test_data.py +++ b/tests/core/test_data.py @@ -48,8 +48,8 @@ def test_get_nlu_file(project): def test_get_core_nlu_files(project): data_dir = os.path.join(project, "data") - core_files, nlu_files = data.get_core_nlu_files([data_dir]) - + nlu_files = data.get_data_files([data_dir], data.is_nlu_file) + core_files = data.get_data_files([data_dir], data.is_story_file) assert len(nlu_files) == 1 assert list(nlu_files)[0].endswith("nlu.yml") @@ -77,37 +77,37 @@ def test_get_core_nlu_directories(project): def test_get_core_nlu_directories_with_none(): directories = data.get_core_nlu_directories(None) - assert all([directory for directory in directories]) - assert all([not os.listdir(directory) for directory in directories]) + assert all(directories) + assert all(not os.listdir(directory) for directory in directories) -def test_same_file_names_get_resolved(tmpdir): +def test_same_file_names_get_resolved(tmp_path): # makes sure the resolution properly handles if there are two files with # with the same name in different directories - tmpdir.join("one").mkdir() - tmpdir.join("two").mkdir() - data_dir_one = os.path.join(tmpdir.join("one").join("stories.md").strpath) - data_dir_two = os.path.join(tmpdir.join("two").join("stories.md").strpath) + (tmp_path / "one").mkdir() + (tmp_path / "two").mkdir() + data_dir_one = str(tmp_path / "one" / "stories.md") + data_dir_two = str(tmp_path / "two" / "stories.md") shutil.copy2(DEFAULT_STORIES_FILE, data_dir_one) shutil.copy2(DEFAULT_STORIES_FILE, data_dir_two) - nlu_dir_one = 
os.path.join(tmpdir.join("one").join("nlu.yml").strpath) - nlu_dir_two = os.path.join(tmpdir.join("two").join("nlu.yml").strpath) + nlu_dir_one = str(tmp_path / "one" / "nlu.yml") + nlu_dir_two = str(tmp_path / "two" / "nlu.yml") shutil.copy2(DEFAULT_NLU_DATA, nlu_dir_one) shutil.copy2(DEFAULT_NLU_DATA, nlu_dir_two) - core_directory, nlu_directory = data.get_core_nlu_directories([tmpdir.strpath]) + core_directory, nlu_directory = data.get_core_nlu_directories([str(tmp_path)]) nlu_files = os.listdir(nlu_directory) assert len(nlu_files) == 2 - assert all([f.endswith("nlu.yml") for f in nlu_files]) + assert all(f.endswith("nlu.yml") for f in nlu_files) stories = os.listdir(core_directory) assert len(stories) == 2 - assert all([f.endswith("stories.md") for f in stories]) + assert all(f.endswith("stories.md") for f in stories) @pytest.mark.parametrize( @@ -162,7 +162,7 @@ def test_same_file_names_get_resolved(tmpdir): def test_find_nlu_files_with_different_formats(test_input, expected): examples_dir = "data/examples" data_dir = os.path.join(examples_dir, test_input) - core_files, nlu_files = data.get_core_nlu_files([data_dir]) + nlu_files = data.get_data_files([data_dir], data.is_nlu_file) assert [Path(f) for f in nlu_files] == [Path(f) for f in expected] @@ -193,5 +193,5 @@ def test_is_not_nlu_file_with_json(): def test_get_story_file_with_yaml(): examples_dir = "data/test_yaml_stories" - core_files, nlu_files = data.get_core_nlu_files([examples_dir]) + core_files = data.get_data_files([examples_dir], data.is_story_file) assert core_files diff --git a/tests/core/test_domain.py b/tests/core/test_domain.py index 987497f1ee70..74ba595f5c23 100644 --- a/tests/core/test_domain.py +++ b/tests/core/test_domain.py @@ -505,17 +505,16 @@ def test_collect_intent_properties(intents, entities, intent_properties): assert Domain.collect_intent_properties(intents, entities) == intent_properties -def test_load_domain_from_directory_tree(tmpdir_factory: TempdirFactory): - root = tmpdir_factory.mktemp("Parent Bot") +def test_load_domain_from_directory_tree(tmp_path: Path): root_domain = {"actions": ["utter_root", "utter_root2"]} - utils.dump_obj_as_yaml_to_file(root / "domain_pt1.yml", root_domain) + utils.dump_obj_as_yaml_to_file(tmp_path / "domain_pt1.yml", root_domain) - subdirectory_1 = root / "Skill 1" + subdirectory_1 = tmp_path / "Skill 1" subdirectory_1.mkdir() skill_1_domain = {"actions": ["utter_skill_1"]} utils.dump_obj_as_yaml_to_file(subdirectory_1 / "domain_pt2.yml", skill_1_domain) - subdirectory_2 = root / "Skill 2" + subdirectory_2 = tmp_path / "Skill 2" subdirectory_2.mkdir() skill_2_domain = {"actions": ["utter_skill_2"]} utils.dump_obj_as_yaml_to_file(subdirectory_2 / "domain_pt3.yml", skill_2_domain) @@ -528,7 +527,7 @@ def test_load_domain_from_directory_tree(tmpdir_factory: TempdirFactory): subsubdirectory / "domain_pt4.yaml", skill_2_1_domain ) - actual = Domain.load(str(root)) + actual = Domain.load(str(tmp_path)) expected = [ "utter_root", "utter_root2", diff --git a/tests/core/test_dsl.py b/tests/core/test_dsl.py deleted file mode 100644 index f04055439e42..000000000000 --- a/tests/core/test_dsl.py +++ /dev/null @@ -1,127 +0,0 @@ -from typing import Text, Dict - -import pytest - -from rasa.core.events import UserUttered -from rasa.core.training.dsl import EndToEndReader - - -@pytest.mark.parametrize( - "line, expected", - [ - (" greet: hi", {"intent": "greet", "true_intent": "greet", "text": "hi"}), - ( - " greet: /greet", - { - "intent": "greet", - "true_intent": "greet", - "text": 
"/greet", - "entities": [], - }, - ), - ( - 'greet: /greet{"test": "test"}', - { - "intent": "greet", - "entities": [ - {"entity": "test", "start": 6, "end": 22, "value": "test"} - ], - "true_intent": "greet", - "text": '/greet{"test": "test"}', - }, - ), - ( - 'greet{"test": "test"}: /greet{"test": "test"}', - { - "intent": "greet", - "entities": [ - {"entity": "test", "start": 6, "end": 22, "value": "test"} - ], - "true_intent": "greet", - "text": '/greet{"test": "test"}', - }, - ), - ( - "mood_great: [great](feeling)", - { - "intent": "mood_great", - "entities": [ - {"start": 0, "end": 5, "value": "great", "entity": "feeling"} - ], - "true_intent": "mood_great", - "text": "great", - }, - ), - ( - 'form: greet{"test": "test"}: /greet{"test": "test"}', - { - "intent": "greet", - "entities": [ - {"end": 22, "entity": "test", "start": 6, "value": "test"} - ], - "true_intent": "greet", - "text": '/greet{"test": "test"}', - }, - ), - ], -) -def test_e2e_parsing(line: Text, expected: Dict): - reader = EndToEndReader() - actual = reader._parse_item(line) - - assert actual.as_dict() == expected - - -@pytest.mark.parametrize( - "parse_data, expected_story_string", - [ - ( - { - "text": "/simple", - "parse_data": { - "intent": {"confidence": 1.0, "name": "simple"}, - "entities": [ - {"start": 0, "end": 5, "value": "great", "entity": "feeling"} - ], - }, - }, - "simple: /simple", - ), - ( - { - "text": "great", - "parse_data": { - "intent": {"confidence": 1.0, "name": "simple"}, - "entities": [ - {"start": 0, "end": 5, "value": "great", "entity": "feeling"} - ], - }, - }, - "simple: [great](feeling)", - ), - ( - { - "text": "great", - "parse_data": { - "intent": {"confidence": 1.0, "name": "simple"}, - "entities": [], - }, - }, - "simple: great", - ), - ], -) -def test_user_uttered_to_e2e(parse_data: Dict, expected_story_string: Text): - event = UserUttered.from_story_string("user", parse_data)[0] - - assert isinstance(event, UserUttered) - assert event.as_story_string(e2e=True) == expected_story_string - - -@pytest.mark.parametrize("line", [" greet{: hi"]) -def test_invalid_end_to_end_format(line: Text): - reader = EndToEndReader() - - with pytest.raises(ValueError): - # noinspection PyProtectedMember - _ = reader._parse_item(line) diff --git a/tests/core/test_evaluation.py b/tests/core/test_evaluation.py index 89113d93e220..aa6b0d36f34e 100644 --- a/tests/core/test_evaluation.py +++ b/tests/core/test_evaluation.py @@ -57,7 +57,7 @@ async def test_end_to_end_evaluation_script(default_agent: Agent): END_TO_END_STORY_FILE, default_agent, use_e2e=True ) - story_evaluation, num_stories = _collect_story_predictions( + story_evaluation, num_stories = await _collect_story_predictions( completed_trackers, default_agent, use_e2e=True ) @@ -94,7 +94,7 @@ async def test_end_to_end_evaluation_script_unknown_entity(default_agent: Agent) E2E_STORY_FILE_UNKNOWN_ENTITY, default_agent, use_e2e=True ) - story_evaluation, num_stories = _collect_story_predictions( + story_evaluation, num_stories = await _collect_story_predictions( completed_trackers, default_agent, use_e2e=True ) @@ -108,7 +108,7 @@ async def test_end_to_evaluation_with_forms(form_bot_agent: Agent): "data/test_evaluations/form-end-to-end-stories.md", form_bot_agent, use_e2e=True ) - story_evaluation, num_stories = _collect_story_predictions( + story_evaluation, num_stories = await _collect_story_predictions( test_stories, form_bot_agent, use_e2e=True ) @@ -129,7 +129,7 @@ async def test_source_in_failed_stories(tmpdir: Path, default_agent: Agent): 
failed_stories = rasa.utils.io.read_file(stories_path) assert ( - f"## simple_story_with_unknown_entity ({E2E_STORY_FILE_UNKNOWN_ENTITY})" + f"story: simple_story_with_unknown_entity ({E2E_STORY_FILE_UNKNOWN_ENTITY})" in failed_stories ) @@ -146,7 +146,7 @@ async def test_end_to_evaluation_trips_circuit_breaker(): E2E_STORY_FILE_TRIPS_CIRCUIT_BREAKER, agent, use_e2e=True ) - story_evaluation, num_stories = _collect_story_predictions( + story_evaluation, num_stories = await _collect_story_predictions( test_stories, agent, use_e2e=True ) diff --git a/tests/core/test_events.py b/tests/core/test_events.py index dcc34455a95d..0faf88f90a73 100644 --- a/tests/core/test_events.py +++ b/tests/core/test_events.py @@ -26,6 +26,7 @@ UserUtteranceReverted, AgentUttered, SessionStarted, + md_format_message, ) @@ -300,3 +301,43 @@ def test_event_default_metadata(event_class: Type[Event]): assert event.as_dict()["metadata"] == {} else: assert "metadata" not in event.as_dict() + + +def test_md_format_message(): + assert ( + md_format_message("Hello there!", intent="greet", entities=[]) == "Hello there!" + ) + + +def test_md_format_message_empty(): + assert md_format_message("", intent=None, entities=[]) == "" + + +def test_md_format_message_using_short_entity_syntax(): + formatted = md_format_message( + "I am from Berlin.", + intent="location", + entities=[{"start": 10, "end": 16, "entity": "city", "value": "Berlin"}], + ) + assert formatted == """I am from [Berlin](city).""" + + +def test_md_format_message_using_long_entity_syntax(): + formatted = md_format_message( + "I am from Berlin in Germany.", + intent="location", + entities=[ + {"start": 10, "end": 16, "entity": "city", "value": "Berlin"}, + { + "start": 20, + "end": 27, + "entity": "country", + "value": "Germany", + "role": "destination", + }, + ], + ) + assert ( + formatted + == """I am from [Berlin](city) in [Germany]{"entity": "country", "role": "destination"}.""" + ) diff --git a/tests/core/test_lock_store.py b/tests/core/test_lock_store.py index 38b767170aa7..804e04d063b0 100644 --- a/tests/core/test_lock_store.py +++ b/tests/core/test_lock_store.py @@ -1,5 +1,6 @@ import asyncio import os +from pathlib import Path import numpy as np import pytest @@ -148,15 +149,14 @@ async def test_multiple_conversation_ids(default_agent: Agent): assert processed_ids == conversation_ids -async def test_message_order(tmpdir_factory: TempdirFactory, default_agent: Agent): +async def test_message_order(tmp_path: Path, default_agent: Agent): start_time = time.time() n_messages = 10 lock_wait = 0.1 # let's write the incoming order of messages and the order of results to temp files - temp_path = tmpdir_factory.mktemp("message_order") - results_file = temp_path / "results_file" - incoming_order_file = temp_path / "incoming_order_file" + results_file = tmp_path / "results_file" + incoming_order_file = tmp_path / "incoming_order_file" # We need to mock `Agent.handle_message()` so we can introduce an # artificial holdup (`wait_time_in_seconds`). 
In the mocked method, we'll diff --git a/tests/core/test_model.py b/tests/core/test_model.py index 13485fc80974..addcdd5ff70c 100644 --- a/tests/core/test_model.py +++ b/tests/core/test_model.py @@ -80,7 +80,7 @@ def test_get_model_exception(model_path): def test_get_model_from_directory_with_subdirectories( - trained_rasa_model, tmpdir_factory: TempdirFactory + trained_rasa_model: Text, tmp_path: Path ): unpacked = get_model(trained_rasa_model) unpacked_core, unpacked_nlu = get_model_subdirectories(unpacked) @@ -88,9 +88,8 @@ def test_get_model_from_directory_with_subdirectories( assert unpacked_core assert unpacked_nlu - directory = tmpdir_factory.mktemp("empty_model_dir").strpath with pytest.raises(ModelNotFound): - get_model_subdirectories(directory) + get_model_subdirectories(str(tmp_path)) # temp path should be empty def test_get_model_from_directory_nlu_only(trained_rasa_model): @@ -235,7 +234,9 @@ async def test_create_fingerprint_from_invalid_paths(project, project_files): @pytest.mark.parametrize("use_fingerprint", [True, False]) -async def test_rasa_packaging(trained_rasa_model, project, use_fingerprint): +async def test_rasa_packaging( + trained_rasa_model: Text, project: Text, use_fingerprint: bool, tmp_path: Path +): unpacked_model_path = get_model(trained_rasa_model) os.remove(os.path.join(unpacked_model_path, FINGERPRINT_FILE_PATH)) @@ -244,8 +245,7 @@ async def test_rasa_packaging(trained_rasa_model, project, use_fingerprint): else: fingerprint = None - tempdir = tempfile.mkdtemp() - output_path = os.path.join(tempdir, "test.tar.gz") + output_path = str(tmp_path / "test.tar.gz") create_package_rasa(unpacked_model_path, output_path, fingerprint) @@ -314,23 +314,26 @@ async def test_rasa_packaging(trained_rasa_model, project, use_fingerprint): }, ], ) -def test_should_retrain(trained_rasa_model: Text, fingerprint: Fingerprint): - old_model = set_fingerprint(trained_rasa_model, fingerprint["old"]) +def test_should_retrain( + trained_rasa_model: Text, fingerprint: Fingerprint, tmp_path: Path +): + old_model = set_fingerprint(trained_rasa_model, fingerprint["old"], tmp_path) - retrain = should_retrain(fingerprint["new"], old_model, tempfile.mkdtemp()) + retrain = should_retrain(fingerprint["new"], old_model, str(tmp_path)) assert retrain.should_retrain_core() == fingerprint["retrain_core"] assert retrain.should_retrain_nlg() == fingerprint["retrain_nlg"] assert retrain.should_retrain_nlu() == fingerprint["retrain_nlu"] -def set_fingerprint(trained_rasa_model: Text, fingerprint: Fingerprint) -> Text: +def set_fingerprint( + trained_rasa_model: Text, fingerprint: Fingerprint, tmp_path: Path +) -> Text: unpacked_model_path = get_model(trained_rasa_model) os.remove(os.path.join(unpacked_model_path, FINGERPRINT_FILE_PATH)) - tempdir = tempfile.mkdtemp() - output_path = os.path.join(tempdir, "test.tar.gz") + output_path = str(tmp_path / "test.tar.gz") create_package_rasa(unpacked_model_path, output_path, fingerprint) diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py index 9232677cbb40..a96dab7da207 100644 --- a/tests/core/test_policies.py +++ b/tests/core/test_policies.py @@ -1,3 +1,4 @@ +from pathlib import Path from typing import Type from unittest.mock import Mock, patch @@ -105,23 +106,25 @@ async def trained_policy(self, featurizer, priority): policy.train(training_trackers, default_domain, RegexInterpreter()) return policy - def test_featurizer(self, trained_policy, tmpdir): + def test_featurizer(self, trained_policy: Policy, tmp_path: Path): assert 
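The `test_model.py` hunks above write model archives beneath `tmp_path` rather than a `tempfile.mkdtemp()` directory. A sketch of the packaging round trip, assuming `get_model` and `create_package_rasa` are imported from `rasa.model` as in these tests, and that passing `None` as the fingerprint is acceptable (mirroring the `use_fingerprint=False` case):

```python
from pathlib import Path

from rasa.model import create_package_rasa, get_model

def package_model(trained_rasa_model: str, tmp_path: Path) -> str:
    # unpack the trained model, then re-package it as a tar.gz inside tmp_path
    unpacked_model_path = get_model(trained_rasa_model)
    output_path = str(tmp_path / "test.tar.gz")
    create_package_rasa(unpacked_model_path, output_path, None)
    return output_path
```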
isinstance(trained_policy.featurizer, MaxHistoryTrackerFeaturizer) assert trained_policy.featurizer.max_history == self.max_history assert isinstance( trained_policy.featurizer.state_featurizer, BinarySingleStateFeaturizer ) - trained_policy.persist(tmpdir.strpath) - loaded = trained_policy.__class__.load(tmpdir.strpath) + trained_policy.persist(str(tmp_path)) + loaded = trained_policy.__class__.load(str(tmp_path)) assert isinstance(loaded.featurizer, MaxHistoryTrackerFeaturizer) assert loaded.featurizer.max_history == self.max_history assert isinstance( loaded.featurizer.state_featurizer, BinarySingleStateFeaturizer ) - async def test_persist_and_load(self, trained_policy, default_domain, tmpdir): - trained_policy.persist(tmpdir.strpath) - loaded = trained_policy.__class__.load(tmpdir.strpath) + async def test_persist_and_load( + self, trained_policy: Policy, default_domain: Domain, tmp_path: Path + ): + trained_policy.persist(str(tmp_path)) + loaded = trained_policy.__class__.load(str(tmp_path)) trackers = await train_trackers(default_domain, augmentation_factor=20) for tracker in trackers: @@ -147,10 +150,10 @@ def test_prediction_on_empty_tracker(self, trained_policy, default_domain): @pytest.mark.filterwarnings( "ignore:.*without a trained model present.*:UserWarning" ) - def test_persist_and_load_empty_policy(self, tmpdir): + def test_persist_and_load_empty_policy(self, tmp_path: Path): empty_policy = self.create_policy(None, None) - empty_policy.persist(tmpdir.strpath) - loaded = empty_policy.__class__.load(tmpdir.strpath) + empty_policy.persist(str(tmp_path)) + loaded = empty_policy.__class__.load(str(tmp_path)) assert loaded is not None @staticmethod @@ -418,14 +421,14 @@ def create_policy(self, featurizer, priority): p = TEDPolicy(priority=priority) return p - def test_featurizer(self, trained_policy, tmpdir): + def test_featurizer(self, trained_policy: Policy, tmp_path: Path): assert isinstance(trained_policy.featurizer, FullDialogueTrackerFeaturizer) assert isinstance( trained_policy.featurizer.state_featurizer, LabelTokenizerSingleStateFeaturizer, ) - trained_policy.persist(tmpdir.strpath) - loaded = trained_policy.__class__.load(tmpdir.strpath) + trained_policy.persist(str(tmp_path)) + loaded = trained_policy.__class__.load(str(tmp_path)) assert isinstance(loaded.featurizer, FullDialogueTrackerFeaturizer) assert isinstance( loaded.featurizer.state_featurizer, LabelTokenizerSingleStateFeaturizer @@ -440,15 +443,15 @@ def create_policy(self, featurizer, priority): p = TEDPolicy(priority=priority, max_history=self.max_history) return p - def test_featurizer(self, trained_policy, tmpdir): + def test_featurizer(self, trained_policy: Policy, tmp_path: Path): assert isinstance(trained_policy.featurizer, MaxHistoryTrackerFeaturizer) assert trained_policy.featurizer.max_history == self.max_history assert isinstance( trained_policy.featurizer.state_featurizer, LabelTokenizerSingleStateFeaturizer, ) - trained_policy.persist(tmpdir.strpath) - loaded = trained_policy.__class__.load(tmpdir.strpath) + trained_policy.persist(str(tmp_path)) + loaded = trained_policy.__class__.load(str(tmp_path)) assert isinstance(loaded.featurizer, MaxHistoryTrackerFeaturizer) assert loaded.featurizer.max_history == self.max_history assert isinstance( @@ -495,15 +498,17 @@ def create_policy(self, featurizer, priority): p = MemoizationPolicy(priority=priority, max_history=max_history) return p - def test_featurizer(self, trained_policy, tmpdir): + def test_featurizer(self, trained_policy: Policy, tmp_path: 
Path): assert isinstance(trained_policy.featurizer, MaxHistoryTrackerFeaturizer) assert trained_policy.featurizer.state_featurizer is None - trained_policy.persist(tmpdir.strpath) - loaded = trained_policy.__class__.load(tmpdir.strpath) + trained_policy.persist(str(tmp_path)) + loaded = trained_policy.__class__.load(str(tmp_path)) assert isinstance(loaded.featurizer, MaxHistoryTrackerFeaturizer) assert loaded.featurizer.state_featurizer is None - async def test_memorise(self, trained_policy, default_domain): + async def test_memorise( + self, trained_policy: MemoizationPolicy, default_domain: Domain + ): trackers = await train_trackers(default_domain, augmentation_factor=20) trained_policy.train(trackers, default_domain, RegexInterpreter()) lookup_with_augmentation = trained_policy.lookup @@ -639,10 +644,10 @@ def create_policy(self, featurizer, priority): p = MappingPolicy() return p - def test_featurizer(self, trained_policy, tmpdir): + def test_featurizer(self, trained_policy: Policy, tmp_path: Path): assert trained_policy.featurizer is None - trained_policy.persist(tmpdir.strpath) - loaded = trained_policy.__class__.load(tmpdir.strpath) + trained_policy.persist(str(tmp_path)) + loaded = trained_policy.__class__.load(str(tmp_path)) assert loaded.featurizer is None @pytest.fixture(scope="module") @@ -720,10 +725,10 @@ def create_policy(self, featurizer, priority): p = FallbackPolicy(priority=priority) return p - def test_featurizer(self, trained_policy, tmpdir): + def test_featurizer(self, trained_policy: Policy, tmp_path: Path): assert trained_policy.featurizer is None - trained_policy.persist(tmpdir.strpath) - loaded = trained_policy.__class__.load(tmpdir.strpath) + trained_policy.persist(str(tmp_path)) + loaded = trained_policy.__class__.load(str(tmp_path)) assert loaded.featurizer is None @pytest.mark.parametrize( diff --git a/tests/core/test_processor.py b/tests/core/test_processor.py index 92686727b1a6..77a60c79326d 100644 --- a/tests/core/test_processor.py +++ b/tests/core/test_processor.py @@ -69,14 +69,14 @@ async def test_message_id_logging(default_processor: MessageProcessor): async def test_parsing(default_processor: MessageProcessor): message = UserMessage('/greet{"name": "boy"}') - parsed = await default_processor._parse_message(message) + parsed = await default_processor.parse_message(message) assert parsed["intent"][INTENT_NAME_KEY] == "greet" assert parsed["entities"][0]["entity"] == "name" async def test_check_for_unseen_feature(default_processor: MessageProcessor): message = UserMessage('/dislike{"test_entity": "RASA"}') - parsed = await default_processor._parse_message(message) + parsed = await default_processor.parse_message(message) with pytest.warns(UserWarning) as record: default_processor._check_for_unseen_features(parsed) assert len(record) == 2 @@ -96,7 +96,7 @@ async def test_default_intent_recognized( default_processor: MessageProcessor, default_intent: Text ): message = UserMessage(default_intent) - parsed = await default_processor._parse_message(message) + parsed = await default_processor.parse_message(message) with pytest.warns(None) as record: default_processor._check_for_unseen_features(parsed) assert len(record) == 0 @@ -111,9 +111,7 @@ async def test_http_parsing(): inter = RasaNLUHttpInterpreter(endpoint_config=endpoint) try: - await MessageProcessor(inter, None, None, None, None)._parse_message( - message - ) + await MessageProcessor(inter, None, None, None, None).parse_message(message) except KeyError: pass # logger looks for intent and entities, so 
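The `test_processor.py` hunks above rename the processor's private `_parse_message` to a public `parse_message` coroutine. A minimal sketch of the new call, assuming a `MessageProcessor` instance such as the `default_processor` fixture used here:

```python
from rasa.core.channels import UserMessage

async def parse(default_processor):
    message = UserMessage('/greet{"name": "boy"}')
    # previously default_processor._parse_message(message)
    parsed = await default_processor.parse_message(message)
    return parsed["intent"], parsed["entities"]
```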
we except diff --git a/tests/core/test_trackers.py b/tests/core/test_trackers.py index 78888eec0745..99e1db5095ec 100644 --- a/tests/core/test_trackers.py +++ b/tests/core/test_trackers.py @@ -1,6 +1,7 @@ import json import logging import os +from pathlib import Path import tempfile from typing import List, Text, Dict, Any, Type @@ -129,14 +130,14 @@ def test_tracker_store(store, pair): assert restored == tracker -async def test_tracker_write_to_story(tmpdir, moodbot_domain: Domain): +async def test_tracker_write_to_story(tmp_path: Path, moodbot_domain: Domain): tracker = tracker_from_dialogue_file( "data/test_dialogues/moodbot.json", moodbot_domain ) - p = tmpdir.join("export.md") - tracker.export_stories_to_file(p.strpath) + p = tmp_path / "export.md" + tracker.export_stories_to_file(str(p)) trackers = await training.load_data( - p.strpath, + str(p), moodbot_domain, use_story_concatenation=False, tracker_limit=1000, @@ -465,17 +466,17 @@ def test_traveling_back_in_time(default_domain: Domain): assert len(list(tracker.generate_all_prior_trackers())) == 2 -async def test_dump_and_restore_as_json(default_agent, tmpdir_factory): +async def test_dump_and_restore_as_json(default_agent: Agent, tmp_path: Path): trackers = await default_agent.load_data(DEFAULT_STORIES_FILE) for tracker in trackers: - out_path = tmpdir_factory.mktemp("tracker").join("dumped_tracker.json") + out_path = tmp_path / "dumped_tracker.json" dumped = tracker.current_state(EventVerbosity.AFTER_RESTART) - rasa.utils.io.dump_obj_as_json_to_file(out_path.strpath, dumped) + rasa.utils.io.dump_obj_as_json_to_file(str(out_path), dumped) restored_tracker = restore.load_tracker_from_json( - out_path.strpath, default_agent.domain + str(out_path), default_agent.domain ) assert restored_tracker == tracker diff --git a/tests/core/test_training.py b/tests/core/test_training.py index 08284c82e374..0e18a0910962 100644 --- a/tests/core/test_training.py +++ b/tests/core/test_training.py @@ -25,7 +25,7 @@ async def test_story_visualization( import rasa.core.training.loading as core_loading story_steps = await core_loading.load_data_from_resource( - "data/test_stories/stories.md", default_domain, interpreter=RegexInterpreter() + "data/test_stories/stories.md", default_domain ) out_file = str(tmp_path / "graph.html") generated_graph = await visualize_stories( @@ -51,7 +51,7 @@ async def test_story_visualization_with_merging( import rasa.core.training.loading as core_loading story_steps = await core_loading.load_data_from_resource( - stories_file, default_domain, interpreter=RegexInterpreter() + stories_file, default_domain ) generated_graph = await visualize_stories( story_steps, diff --git a/tests/core/test_visualization.py b/tests/core/test_visualization.py index 6ad408f51786..3d30b0635014 100644 --- a/tests/core/test_visualization.py +++ b/tests/core/test_visualization.py @@ -1,3 +1,4 @@ +from pathlib import Path from typing import Text import pytest @@ -82,16 +83,17 @@ def test_common_action_prefix_unequal(): "stories_file", ["data/test_stories/stories.md", "data/test_yaml_stories/stories.yml"], ) -async def test_graph_persistence(stories_file: Text, default_domain: Domain, tmpdir): +async def test_graph_persistence( + stories_file: Text, default_domain: Domain, tmp_path: Path +): from os.path import isfile from networkx.drawing import nx_pydot - from rasa.core.interpreter import RegexInterpreter import rasa.core.training.loading as core_loading story_steps = await core_loading.load_data_from_resource( - stories_file, default_domain, 
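The `test_training.py` hunks above drop the `interpreter` argument from the story-loading API, so `load_data_from_resource` is now called with just the resource and the domain (plus keyword options). A minimal sketch, assuming a `default_domain` fixture:

```python
import rasa.core.training.loading as core_loading

async def load_story_steps(default_domain):
    # no RegexInterpreter argument anymore; parsing is handled internally
    return await core_loading.load_data_from_resource(
        "data/test_stories/stories.md", default_domain
    )
```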
RegexInterpreter() + stories_file, default_domain ) - out_file = tmpdir.join("graph.html").strpath + out_file = str(tmp_path / "graph.html") generated_graph = await visualization.visualize_stories( story_steps, default_domain, @@ -114,15 +116,14 @@ async def test_graph_persistence(stories_file: Text, default_domain: Domain, tmp "stories_file", ["data/test_stories/stories.md", "data/test_yaml_stories/stories.yml"], ) -async def test_merge_nodes(stories_file: Text, default_domain: Domain, tmpdir): +async def test_merge_nodes(stories_file: Text, default_domain: Domain, tmp_path: Path): from os.path import isfile - from rasa.core.interpreter import RegexInterpreter import rasa.core.training.loading as core_loading story_steps = await core_loading.load_data_from_resource( - stories_file, default_domain, RegexInterpreter() + stories_file, default_domain ) - out_file = tmpdir.join("graph.html").strpath + out_file = str(tmp_path / "graph.html") await visualization.visualize_stories( story_steps, default_domain, diff --git a/tests/core/training/story_reader/test_common_story_reader.py b/tests/core/training/story_reader/test_common_story_reader.py index f1d86cb5fde6..c9339be1e24c 100644 --- a/tests/core/training/story_reader/test_common_story_reader.py +++ b/tests/core/training/story_reader/test_common_story_reader.py @@ -173,13 +173,13 @@ async def test_generate_training_data_original_and_augmented_trackers( ], ) async def test_visualize_training_data_graph( - stories_file: Text, tmpdir, default_domain: Domain + stories_file: Text, tmp_path: Path, default_domain: Domain ): graph = await training.extract_story_graph(stories_file, default_domain) graph = graph.with_cycles_removed() - out_path = tmpdir.join("graph.html").strpath + out_path = str(tmp_path / "graph.html") # this will be the plotted networkx graph G = graph.visualize(out_path) diff --git a/tests/core/training/story_reader/test_markdown_story_reader.py b/tests/core/training/story_reader/test_markdown_story_reader.py index 125f2f1c3482..90d0f63d8a32 100644 --- a/tests/core/training/story_reader/test_markdown_story_reader.py +++ b/tests/core/training/story_reader/test_markdown_story_reader.py @@ -1,3 +1,8 @@ +from pathlib import Path +from typing import Dict, Text + +import pytest + import rasa.utils.io from rasa.core import training from rasa.core.domain import Domain @@ -10,22 +15,23 @@ SlotSet, LegacyForm, ) -from rasa.core.interpreter import RegexInterpreter from rasa.core.trackers import DialogueStateTracker from rasa.core.training import loading from rasa.core.training.story_reader.markdown_story_reader import MarkdownStoryReader from rasa.core.training.structures import Story -async def test_persist_and_read_test_story_graph(tmpdir, default_domain: Domain): +async def test_persist_and_read_test_story_graph( + tmp_path: Path, default_domain: Domain +): graph = await training.extract_story_graph( "data/test_stories/stories.md", default_domain ) - out_path = tmpdir.join("persisted_story.md") - rasa.utils.io.write_text_file(graph.as_story_string(), out_path.strpath) + out_path = tmp_path / "persisted_story.md" + rasa.utils.io.write_text_file(graph.as_story_string(), str(out_path)) recovered_trackers = await training.load_data( - out_path.strpath, + str(out_path), default_domain, use_story_concatenation=False, tracker_limit=1000, @@ -46,15 +52,15 @@ async def test_persist_and_read_test_story_graph(tmpdir, default_domain: Domain) existing_stories.discard(story_str) -async def test_persist_and_read_test_story(tmpdir, default_domain: 
Domain): +async def test_persist_and_read_test_story(tmp_path: Path, default_domain: Domain): graph = await training.extract_story_graph( "data/test_stories/stories.md", default_domain ) - out_path = tmpdir.join("persisted_story.md") - Story(graph.story_steps).dump_to_file(out_path.strpath) + out_path = tmp_path / "persisted_story.md" + Story(graph.story_steps).dump_to_file(str(out_path)) recovered_trackers = await training.load_data( - out_path.strpath, + str(out_path), default_domain, use_story_concatenation=False, tracker_limit=1000, @@ -223,7 +229,7 @@ async def test_persist_form_story(): async def test_read_stories_with_multiline_comments(tmpdir, default_domain: Domain): - reader = MarkdownStoryReader(RegexInterpreter(), default_domain) + reader = MarkdownStoryReader(default_domain) story_steps = await reader.read_from_file( "data/test_stories/stories_with_multiline_comments.md" @@ -242,7 +248,7 @@ async def test_read_stories_with_multiline_comments(tmpdir, default_domain: Doma async def test_read_stories_with_rules(default_domain: Domain): story_steps = await loading.load_data_from_files( - ["data/test_stories/stories_with_rules.md"], default_domain, RegexInterpreter() + ["data/test_stories/stories_with_rules.md"], default_domain ) # this file contains three rules and two ML stories @@ -263,9 +269,7 @@ async def test_read_stories_with_rules(default_domain: Domain): async def test_read_rules_without_stories(default_domain: Domain): story_steps = await loading.load_data_from_files( - ["data/test_stories/rules_without_stories.md"], - default_domain, - RegexInterpreter(), + ["data/test_stories/rules_without_stories.md"], default_domain, ) # this file contains three rules and two ML stories @@ -295,3 +299,111 @@ async def test_read_rules_without_stories(default_domain: Domain): [{"entity": "some_slot", "start": 6, "end": 25, "value": "bla"}], ) assert events[4] == ActionExecuted("loop_q_form") + + +@pytest.mark.parametrize( + "line, expected", + [ + (" greet: hi", {"intent": "greet", "text": "hi"}), + (" greet: /greet", {"intent": "greet", "text": "/greet", "entities": [],},), + ( + 'greet: /greet{"test": "test"}', + { + "intent": "greet", + "entities": [ + {"entity": "test", "start": 6, "end": 22, "value": "test"} + ], + "text": '/greet{"test": "test"}', + }, + ), + ( + 'greet{"test": "test"}: /greet{"test": "test"}', + { + "intent": "greet", + "entities": [ + {"entity": "test", "start": 6, "end": 22, "value": "test"} + ], + "text": '/greet{"test": "test"}', + }, + ), + ( + "mood_great: [great](feeling)", + { + "intent": "mood_great", + "entities": [ + {"start": 0, "end": 5, "value": "great", "entity": "feeling"} + ], + "text": "great", + }, + ), + ( + 'form: greet{"test": "test"}: /greet{"test": "test"}', + { + "intent": "greet", + "entities": [ + {"end": 22, "entity": "test", "start": 6, "value": "test"} + ], + "text": '/greet{"test": "test"}', + }, + ), + ], +) +def test_e2e_parsing(line: Text, expected: Dict): + actual = MarkdownStoryReader.parse_e2e_message(line) + + assert actual.as_dict() == expected + + +@pytest.mark.parametrize( + "parse_data, expected_story_string", + [ + ( + { + "text": "/simple", + "parse_data": { + "intent": {"confidence": 1.0, "name": "simple"}, + "entities": [ + {"start": 0, "end": 5, "value": "great", "entity": "feeling"} + ], + }, + }, + "simple: /simple", + ), + ( + { + "text": "great", + "parse_data": { + "intent": {"confidence": 1.0, "name": "simple"}, + "entities": [ + {"start": 0, "end": 5, "value": "great", "entity": "feeling"} + ], + }, + }, + 
"simple: [great](feeling)", + ), + ( + { + "text": "great", + "parse_data": { + "intent": {"confidence": 1.0, "name": "simple"}, + "entities": [], + }, + }, + "simple: great", + ), + ], +) +def test_user_uttered_to_e2e(parse_data: Dict, expected_story_string: Text): + event = UserUttered.from_story_string("user", parse_data)[0] + + assert isinstance(event, UserUttered) + assert event.as_story_string(e2e=True) == expected_story_string + + +@pytest.mark.parametrize("line", [" greet{: hi"]) +def test_invalid_end_to_end_format(line: Text): + reader = MarkdownStoryReader() + + with pytest.raises(ValueError): + # noinspection PyProtectedMember + _ = reader.parse_e2e_message(line) diff --git a/tests/core/training/story_reader/test_yaml_story_reader.py b/tests/core/training/story_reader/test_yaml_story_reader.py index 175b11dd9462..e142e26d460c 100644 --- a/tests/core/training/story_reader/test_yaml_story_reader.py +++ b/tests/core/training/story_reader/test_yaml_story_reader.py @@ -1,26 +1,24 @@ +from pathlib import Path from typing import Text, List import pytest +import rasa.utils.io from rasa.constants import LATEST_TRAINING_DATA_FORMAT_VERSION from rasa.core import training from rasa.core.actions.action import RULE_SNIPPET_ACTION_NAME from rasa.core.domain import Domain from rasa.core.training import loading from rasa.core.events import ActionExecuted, UserUttered, SlotSet, ActiveLoop -from rasa.core.interpreter import RegexInterpreter from rasa.core.training.story_reader.yaml_story_reader import YAMLStoryReader from rasa.core.training.structures import StoryStep -from rasa.utils import io as io_utils @pytest.fixture() async def rule_steps_without_stories(default_domain: Domain) -> List[StoryStep]: yaml_file = "data/test_yaml_stories/rules_without_stories.yml" - return await loading.load_data_from_files( - [yaml_file], default_domain, RegexInterpreter() - ) + return await loading.load_data_from_files([yaml_file], default_domain) async def test_can_read_test_story_with_slots(default_domain: Domain): @@ -162,9 +160,7 @@ async def test_read_rules_with_stories(default_domain: Domain): yaml_file = "data/test_yaml_stories/stories_and_rules.yml" - steps = await loading.load_data_from_files( - [yaml_file], default_domain, RegexInterpreter() - ) + steps = await loading.load_data_from_files([yaml_file], default_domain) ml_steps = [s for s in steps if not s.is_rule] rule_steps = [s for s in steps if s.is_rule] @@ -260,8 +256,8 @@ async def test_warning_if_intent_not_in_domain(default_domain: Domain): - intent: definitely not in domain """ - reader = YAMLStoryReader(RegexInterpreter(), default_domain) - yaml_content = io_utils.read_yaml(stories) + reader = YAMLStoryReader(default_domain) + yaml_content = rasa.utils.io.read_yaml(stories) with pytest.warns(UserWarning) as record: reader.read_from_parsed_yaml(yaml_content) @@ -279,8 +275,8 @@ async def test_no_warning_if_intent_in_domain(default_domain: Domain): f" - intent: greet" ) - reader = YAMLStoryReader(RegexInterpreter(), default_domain) - yaml_content = io_utils.read_yaml(stories) + reader = YAMLStoryReader(default_domain) + yaml_content = rasa.utils.io.read_yaml(stories) with pytest.warns(None) as record: reader.read_from_parsed_yaml(yaml_content) @@ -298,10 +294,33 @@ async def test_active_loop_is_parsed(default_domain: Domain): f" - active_loop: null" ) - reader = YAMLStoryReader(RegexInterpreter(), default_domain) - yaml_content = io_utils.read_yaml(stories) + reader = YAMLStoryReader(default_domain) + yaml_content = 
rasa.utils.io.read_yaml(stories) with pytest.warns(None) as record: reader.read_from_parsed_yaml(yaml_content) assert not len(record) + + +def test_is_test_story_file(tmp_path: Path): + path = str(tmp_path / "test_stories.yml") + rasa.utils.io.write_yaml({"stories": []}, path) + assert YAMLStoryReader.is_yaml_test_stories_file(path) + + +def test_is_not_test_story_file_if_it_doesnt_contain_stories(tmp_path: Path): + path = str(tmp_path / "test_stories.yml") + rasa.utils.io.write_yaml({"nlu": []}, path) + assert not YAMLStoryReader.is_yaml_test_stories_file(path) + + +def test_is_not_test_story_file_if_empty(tmp_path: Path): + path = str(tmp_path / "test_stories.yml") + assert not YAMLStoryReader.is_yaml_test_stories_file(path) + + +def test_is_not_test_story_file_without_test_prefix(tmp_path: Path): + path = str(tmp_path / "stories.yml") + rasa.utils.io.write_yaml({"stories": []}, path) + assert not YAMLStoryReader.is_yaml_test_stories_file(path) diff --git a/tests/core/training/story_writer/test_yaml_story_writer.py b/tests/core/training/story_writer/test_yaml_story_writer.py index 7772e525c7cd..918f318a3389 100644 --- a/tests/core/training/story_writer/test_yaml_story_writer.py +++ b/tests/core/training/story_writer/test_yaml_story_writer.py @@ -1,10 +1,12 @@ from pathlib import Path +import textwrap from typing import Text import pytest from rasa.core.domain import Domain -from rasa.core.interpreter import RegexInterpreter +from rasa.core.events import ActionExecuted, UserUttered +from rasa.core.trackers import DialogueStateTracker from rasa.core.training.story_reader.markdown_story_reader import MarkdownStoryReader from rasa.core.training.story_reader.yaml_story_reader import YAMLStoryReader from rasa.core.training.story_writer.yaml_story_writer import YAMLStoryWriter @@ -25,18 +27,11 @@ async def test_simple_story( ): original_md_reader = MarkdownStoryReader( - RegexInterpreter(), - default_domain, - None, - False, - input_yaml_file, - unfold_or_utterances=False, + default_domain, None, False, input_yaml_file, unfold_or_utterances=False, ) original_md_story_steps = await original_md_reader.read_from_file(input_md_file) - original_yaml_reader = YAMLStoryReader( - RegexInterpreter(), default_domain, None, False - ) + original_yaml_reader = YAMLStoryReader(default_domain, None, False) original_yaml_story_steps = await original_yaml_reader.read_from_file( input_yaml_file ) @@ -45,9 +40,7 @@ async def test_simple_story( writer = YAMLStoryWriter() writer.dump(target_story_filename, original_md_story_steps) - processed_yaml_reader = YAMLStoryReader( - RegexInterpreter(), default_domain, None, False - ) + processed_yaml_reader = YAMLStoryReader(default_domain, None, False) processed_yaml_story_steps = await processed_yaml_reader.read_from_file( target_story_filename ) @@ -61,7 +54,7 @@ async def test_simple_story( async def test_forms_are_skipped_with_warning(default_domain: Domain): original_md_reader = MarkdownStoryReader( - RegexInterpreter(), default_domain, None, False, unfold_or_utterances=False, + default_domain, None, False, unfold_or_utterances=False, ) original_md_story_steps = await original_md_reader.read_from_file( "data/test_stories/stories_form.md" @@ -74,3 +67,53 @@ async def test_forms_are_skipped_with_warning(default_domain: Domain): # We skip 5 stories with the forms and warn users assert len(record) == 5 + + +def test_yaml_writer_dumps_user_messages(): + events = [ + UserUttered("Hello", {"name": "greet"}), + ActionExecuted("utter_greet"), + ] + tracker = 
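Taken together, the `is_yaml_test_stories_file` tests above pin down three conditions: the file name carries the `test_` prefix, the content parses as YAML, and a top-level `stories` key is present. A minimal sketch combining them:

```python
from pathlib import Path

import rasa.utils.io
from rasa.core.training.story_reader.yaml_story_reader import YAMLStoryReader

def write_recognized_test_stories_file(tmp_path: Path) -> str:
    # `test_` prefix + parseable YAML + top-level `stories` key
    path = str(tmp_path / "test_stories.yml")
    rasa.utils.io.write_yaml({"stories": []}, path)
    assert YAMLStoryReader.is_yaml_test_stories_file(path)
    return path
```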
DialogueStateTracker.from_events("default", events) + dump = YAMLStoryWriter().dumps(tracker.as_story().story_steps) + + assert ( + dump.strip() + == textwrap.dedent( + """ + version: "2.0" + stories: + - story: default + steps: + - intent: greet + user: |- + Hello + - action: utter_greet + + """ + ).strip() + ) + + +def test_yaml_writer_avoids_dumping_not_existing_user_messages(): + events = [ + UserUttered("greet", {"name": "greet"}), + ActionExecuted("utter_greet"), + ] + tracker = DialogueStateTracker.from_events("default", events) + dump = YAMLStoryWriter().dumps(tracker.as_story().story_steps) + + assert ( + dump.strip() + == textwrap.dedent( + """ + version: "2.0" + stories: + - story: default + steps: + - intent: greet + - action: utter_greet + + """ + ).strip() + ) diff --git a/tests/examples/test_example_bots_training_data.py b/tests/examples/test_example_bots_training_data.py index fee0156c0358..4ba07428ab2d 100644 --- a/tests/examples/test_example_bots_training_data.py +++ b/tests/examples/test_example_bots_training_data.py @@ -1,7 +1,9 @@ +from pathlib import Path from typing import Text import pytest +from rasa.cli import scaffold from rasa.importers.importer import TrainingDataImporter @@ -33,11 +35,6 @@ "examples/rules/domain.yml", "examples/rules/data", ), - ( - "rasa/cli/initial_project/config.yml", - "rasa/cli/initial_project/domain.yml", - "rasa/cli/initial_project/data", - ), ], ) async def test_example_bot_training_data_not_raises( @@ -53,3 +50,21 @@ async def test_example_bot_training_data_not_raises( await importer.get_stories() assert not len(record) + + +async def test_example_bot_training_on_initial_project(tmp_path: Path): + # we need to test this one separately, as we can't test it in place + # configuration suggestions would otherwise change the initial file + scaffold.create_initial_project(str(tmp_path)) + + importer = TrainingDataImporter.load_from_config( + str(tmp_path / "config.yml"), + str(tmp_path / "domain.yml"), + str(tmp_path / "data"), + ) + + with pytest.warns(None) as record: + await importer.get_nlu_data() + await importer.get_stories() + + assert not len(record) diff --git a/tests/importers/test_multi_project.py b/tests/importers/test_multi_project.py index bff2d51b6813..e79a43e92af9 100644 --- a/tests/importers/test_multi_project.py +++ b/tests/importers/test_multi_project.py @@ -1,3 +1,4 @@ +from pathlib import Path from typing import Dict, Text import pytest @@ -10,25 +11,25 @@ DEFAULT_E2E_TESTS_PATH, ) from rasa.nlu.training_data.formats import RasaReader +import rasa.utils.io from rasa import model from rasa.core import utils from rasa.core.domain import Domain from rasa.importers.multi_project import MultiProjectImporter -def test_load_imports_from_directory_tree(tmpdir_factory: TempdirFactory): - root = tmpdir_factory.mktemp("Parent Bot") +def test_load_imports_from_directory_tree(tmp_path: Path): root_imports = {"imports": ["Project A"]} - utils.dump_obj_as_yaml_to_file(root / "config.yml", root_imports) + utils.dump_obj_as_yaml_to_file(tmp_path / "config.yml", root_imports) - project_a_directory = root / "Project A" + project_a_directory = tmp_path / "Project A" project_a_directory.mkdir() project_a_imports = {"imports": ["../Project B"]} utils.dump_obj_as_yaml_to_file( project_a_directory / "config.yml", project_a_imports ) - project_b_directory = root / "Project B" + project_b_directory = tmp_path / "Project B" project_b_directory.mkdir() project_b_imports = {"some other": ["../Project C"]} utils.dump_obj_as_yaml_to_file( @@ -44,7 
+45,7 @@ def test_load_imports_from_directory_tree(tmpdir_factory: TempdirFactory): ) # should not be imported - subdirectory_3 = root / "Project C" + subdirectory_3 = tmp_path / "Project C" subdirectory_3.mkdir() expected = [ @@ -52,48 +53,43 @@ def test_load_imports_from_directory_tree(tmpdir_factory: TempdirFactory): os.path.join(str(project_b_directory)), ] - actual = MultiProjectImporter(str(root / "config.yml")) + actual = MultiProjectImporter(str(tmp_path / "config.yml")) assert actual._imports == expected -def test_load_imports_without_imports(tmpdir_factory: TempdirFactory): +def test_load_imports_without_imports(tmp_path: Path): empty_config = {} - root = tmpdir_factory.mktemp("Parent Bot") - utils.dump_obj_as_yaml_to_file(root / "config.yml", empty_config) + utils.dump_obj_as_yaml_to_file(tmp_path / "config.yml", empty_config) - project_a_directory = root / "Project A" + project_a_directory = tmp_path / "Project A" project_a_directory.mkdir() utils.dump_obj_as_yaml_to_file(project_a_directory / "config.yml", empty_config) - project_b_directory = root / "Project B" + project_b_directory = tmp_path / "Project B" project_b_directory.mkdir() utils.dump_obj_as_yaml_to_file(project_b_directory / "config.yml", empty_config) - actual = MultiProjectImporter(str(root / "config.yml")) + actual = MultiProjectImporter(str(tmp_path / "config.yml")) - assert actual.is_imported(str(root / "Project C")) + assert actual.is_imported(str(tmp_path / "Project C")) @pytest.mark.parametrize("input_dict", [{}, {"imports": None}]) -def test_load_from_none(input_dict: Dict, tmpdir_factory: TempdirFactory): - root = tmpdir_factory.mktemp("Parent Bot") - config_path = root / "config.yml" - utils.dump_obj_as_yaml_to_file(root / "config.yml", input_dict) +def test_load_from_none(input_dict: Dict, tmp_path: Path): + config_path = tmp_path / "config.yml" + utils.dump_obj_as_yaml_to_file(tmp_path / "config.yml", input_dict) actual = MultiProjectImporter(str(config_path)) assert actual._imports == list() -def test_load_if_subproject_is_more_specific_than_parent( - tmpdir_factory: TempdirFactory, -): - root = tmpdir_factory.mktemp("Parent Bot") - config_path = str(root / "config.yml") - utils.dump_obj_as_yaml_to_file(root / "config.yml", {}) +def test_load_if_subproject_is_more_specific_than_parent(tmp_path: Path,): + config_path = str(tmp_path / "config.yml") + utils.dump_obj_as_yaml_to_file(tmp_path / "config.yml", {}) - project_a_directory = root / "Project A" + project_a_directory = tmp_path / "Project A" project_a_directory.mkdir() project_a_imports = {"imports": ["Project B"]} utils.dump_obj_as_yaml_to_file( @@ -108,10 +104,11 @@ def test_load_if_subproject_is_more_specific_than_parent( @pytest.mark.parametrize( "input_path", ["A/A/A/B", "A/A/A", "A/B/A/A", "A/A/A/B/C/D/E.type"] ) -def test_in_imports(input_path: Text, tmpdir_factory: TempdirFactory): - root = tmpdir_factory.mktemp("Parent Bot") - config_path = str(root / "config.yml") - utils.dump_obj_as_yaml_to_file(root / "config.yml", {"imports": ["A/A/A", "A/B/A"]}) +def test_in_imports(input_path: Text, tmp_path: Path): + config_path = str(tmp_path / "config.yml") + utils.dump_obj_as_yaml_to_file( + tmp_path / "config.yml", {"imports": ["A/A/A", "A/B/A"]} + ) importer = MultiProjectImporter(config_path, project_directory=os.getcwd()) @@ -119,52 +116,51 @@ def test_in_imports(input_path: Text, tmpdir_factory: TempdirFactory): @pytest.mark.parametrize("input_path", ["A/C", "A/A/B", "A/B"]) -def test_not_in_imports(input_path: Text, tmpdir_factory: 
TempdirFactory): - root = tmpdir_factory.mktemp("Parent Bot") - config_path = str(root / "config.yml") - utils.dump_obj_as_yaml_to_file(root / "config.yml", {"imports": ["A/A/A", "A/B/A"]}) +def test_not_in_imports(input_path: Text, tmp_path: Path): + config_path = str(tmp_path / "config.yml") + utils.dump_obj_as_yaml_to_file( + tmp_path / "config.yml", {"imports": ["A/A/A", "A/B/A"]} + ) importer = MultiProjectImporter(config_path, project_directory=os.getcwd()) assert not importer.is_imported(input_path) -def test_cyclic_imports(tmpdir_factory): - root = tmpdir_factory.mktemp("Parent Bot") +def test_cyclic_imports(tmp_path: Path): project_imports = {"imports": ["Project A"]} - utils.dump_obj_as_yaml_to_file(root / "config.yml", project_imports) + utils.dump_obj_as_yaml_to_file(tmp_path / "config.yml", project_imports) - project_a_directory = root / "Project A" + project_a_directory = tmp_path / "Project A" project_a_directory.mkdir() project_a_imports = {"imports": ["../Project B"]} utils.dump_obj_as_yaml_to_file( project_a_directory / "config.yml", project_a_imports ) - project_b_directory = root / "Project B" + project_b_directory = tmp_path / "Project B" project_b_directory.mkdir() project_b_imports = {"imports": ["../Project A"]} utils.dump_obj_as_yaml_to_file( project_b_directory / "config.yml", project_b_imports ) - actual = MultiProjectImporter(str(root / "config.yml")) + actual = MultiProjectImporter(str(tmp_path / "config.yml")) assert actual._imports == [str(project_a_directory), str(project_b_directory)] -def test_import_outside_project_directory(tmpdir_factory): - root = tmpdir_factory.mktemp("Parent Bot") +def test_import_outside_project_directory(tmp_path: Path): project_imports = {"imports": ["Project A"]} - utils.dump_obj_as_yaml_to_file(root / "config.yml", project_imports) + utils.dump_obj_as_yaml_to_file(tmp_path / "config.yml", project_imports) - project_a_directory = root / "Project A" + project_a_directory = tmp_path / "Project A" project_a_directory.mkdir() project_a_imports = {"imports": ["../Project B"]} utils.dump_obj_as_yaml_to_file( project_a_directory / "config.yml", project_a_imports ) - project_b_directory = root / "Project B" + project_b_directory = tmp_path / "Project B" project_b_directory.mkdir() project_b_imports = {"imports": ["../Project C"]} utils.dump_obj_as_yaml_to_file( @@ -173,87 +169,106 @@ def test_import_outside_project_directory(tmpdir_factory): actual = MultiProjectImporter(str(project_a_directory / "config.yml")) - assert actual._imports == [str(project_b_directory), str(root / "Project C")] + assert actual._imports == [str(project_b_directory), str(tmp_path / "Project C")] -def test_importing_additional_files(tmpdir_factory): - root = tmpdir_factory.mktemp("Parent Bot") +def test_importing_additional_files(tmp_path: Path): config = {"imports": ["bots/Bot A"]} - config_path = str(root / "config.yml") + config_path = str(tmp_path / "config.yml") utils.dump_obj_as_yaml_to_file(config_path, config) - additional_file = root / "directory" / "file.md" + additional_file = tmp_path / "directory" / "file.md" + additional_file.parent.mkdir() # create intermediate directories and fake files - additional_file.write("""## story""", ensure=True) + rasa.utils.io.write_text_file("""## story""", additional_file) selector = MultiProjectImporter( - config_path, training_data_paths=[str(root / "directory"), str(additional_file)] + config_path, + training_data_paths=[str(tmp_path / "directory"), str(additional_file)], ) assert 
selector.is_imported(str(additional_file)) assert str(additional_file) in selector._story_paths -def test_not_importing_not_relevant_additional_files(tmpdir_factory): - root = tmpdir_factory.mktemp("Parent Bot") +def test_not_importing_not_relevant_additional_files(tmp_path: Path): config = {"imports": ["bots/Bot A"]} - config_path = str(root / "config.yml") + config_path = str(tmp_path / "config.yml") utils.dump_obj_as_yaml_to_file(config_path, config) - additional_file = root / "directory" / "file.yml" + additional_file = tmp_path / "directory" / "file.yml" + additional_file.parent.mkdir() + selector = MultiProjectImporter( - config_path, training_data_paths=[str(root / "data"), str(additional_file)] + config_path, training_data_paths=[str(tmp_path / "data"), str(additional_file)] ) - not_relevant_file1 = root / "data" / "another directory" / "file.yml" - not_relevant_file1.write({}, ensure=True) - not_relevant_file2 = root / "directory" / "another_file.yml" - not_relevant_file2.write({}, ensure=True) + not_relevant_file1 = tmp_path / "data" / "another directory" / "file.yml" + not_relevant_file1.parent.mkdir(parents=True) + rasa.utils.io.write_text_file("", not_relevant_file1) + not_relevant_file2 = tmp_path / "directory" / "another_file.yml" + rasa.utils.io.write_text_file("", not_relevant_file2) assert not selector.is_imported(str(not_relevant_file1)) assert not selector.is_imported(str(not_relevant_file2)) +@pytest.mark.parametrize( + "test_stories_filename,test_story", + [ + ( + "test_stories.yml", + """ + stories: + - story: story test + steps: + - user: hello + intent: greet + - action: utter_greet + """, + ), + ( + "conversation_tests.md", + """ + ## story test + * greet : "hello" + - utter_greet + """, + ), + ], +) async def test_only_getting_e2e_conversation_tests_if_e2e_enabled( - tmpdir_factory: TempdirFactory, + tmp_path: Path, test_stories_filename: Text, test_story: Text ): - from rasa.core.interpreter import RegexInterpreter from rasa.core.training.structures import StoryGraph import rasa.core.training.loading as core_loading - root = tmpdir_factory.mktemp("Parent Bot") config = {"imports": ["bots/Bot A"]} - config_path = str(root / "config.yml") + config_path = str(tmp_path / "config.yml") utils.dump_obj_as_yaml_to_file(config_path, config) - story_file = root / "bots" / "Bot A" / "data" / "stories.md" - story_file.write( + story_file = tmp_path / "bots" / "Bot A" / "data" / "stories.md" + story_file.parent.mkdir(parents=True) + rasa.utils.io.write_text_file( """ ## story * greet - utter_greet """, - ensure=True, + story_file, ) - e2e_story_test_file = ( - root / "bots" / "Bot A" / DEFAULT_E2E_TESTS_PATH / "conversation_tests.md" - ) - e2e_story_test_file.write( - """ - ## story test - * greet : "hello" - - utter_greet - """, - ensure=True, + story_test_file = ( + tmp_path / "bots" / "Bot A" / DEFAULT_E2E_TESTS_PATH / test_stories_filename ) + story_test_file.parent.mkdir(parents=True) + rasa.utils.io.write_text_file(test_story, story_test_file) selector = MultiProjectImporter(config_path) story_steps = await core_loading.load_data_from_resource( - resource=str(e2e_story_test_file), + resource=str(story_test_file), domain=Domain.empty(), - interpreter=RegexInterpreter(), template_variables=None, use_e2e=True, exclusion_percentage=None, @@ -266,46 +281,36 @@ async def test_only_getting_e2e_conversation_tests_if_e2e_enabled( assert expected.as_story_string() == actual.as_story_string() -def test_not_importing_e2e_conversation_tests_in_project( - tmpdir_factory: 
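Since `pathlib.Path` offers no equivalent of py.path's `write(..., ensure=True)`, the importer tests above now create intermediate directories explicitly before writing files. A minimal sketch of the replacement pattern:

```python
from pathlib import Path

import rasa.utils.io

def write_nested_story_file(tmp_path: Path) -> Path:
    story_file = tmp_path / "bots" / "Bot A" / "data" / "stories.md"
    # create the intermediate directories that `ensure=True` used to create
    story_file.parent.mkdir(parents=True)
    rasa.utils.io.write_text_file("## story", story_file)
    return story_file
```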
TempdirFactory, -): - root = tmpdir_factory.mktemp("Parent Bot") +def test_not_importing_e2e_conversation_tests_in_project(tmp_path: Path,): config = {"imports": ["bots/Bot A"]} - config_path = str(root / "config.yml") + config_path = str(tmp_path / "config.yml") utils.dump_obj_as_yaml_to_file(config_path, config) - story_file = root / "bots" / "Bot A" / "data" / "stories.md" - story_file.write("""## story""", ensure=True) + story_file = tmp_path / "bots" / "Bot A" / "data" / "stories.md" + story_file.parent.mkdir(parents=True) + rasa.utils.io.write_text_file("""## story""", story_file) - e2e_story_test_file = ( - root / "bots" / "Bot A" / DEFAULT_E2E_TESTS_PATH / "conversation_tests.md" + story_test_file = ( + tmp_path / "bots" / "Bot A" / DEFAULT_E2E_TESTS_PATH / "test_stories.yml" ) - e2e_story_test_file.write("""## story test""", ensure=True) + story_test_file.parent.mkdir(parents=True) + rasa.utils.io.write_text_file("""stories:""", story_test_file) selector = MultiProjectImporter(config_path) # Conversation tests should not be included in story paths - expected = { - "story_paths": [str(story_file)], - "e2e_story_paths": [str(e2e_story_test_file)], - } - - actual = { - "story_paths": selector._story_paths, - "e2e_story_paths": selector._e2e_story_paths, - } - - assert expected == actual + assert [str(story_file)] == selector._story_paths + assert [str(story_test_file)] == selector._e2e_story_paths -def test_single_additional_file(tmpdir_factory): - root = tmpdir_factory.mktemp("Parent Bot") - config_path = str(root / "config.yml") +def test_single_additional_file(tmp_path: Path): + config_path = str(tmp_path / "config.yml") empty_config = {} utils.dump_obj_as_yaml_to_file(config_path, empty_config) - additional_file = root / "directory" / "file.yml" - additional_file.write({}, ensure=True) + additional_file = tmp_path / "directory" / "file.yml" + additional_file.parent.mkdir() + rasa.utils.io.write_yaml({}, additional_file) selector = MultiProjectImporter( config_path, training_data_paths=str(additional_file) diff --git a/tests/nlu/classifiers/test_diet_classifier.py b/tests/nlu/classifiers/test_diet_classifier.py index cd059da3095e..cf17d3e6ffea 100644 --- a/tests/nlu/classifiers/test_diet_classifier.py +++ b/tests/nlu/classifiers/test_diet_classifier.py @@ -1,3 +1,5 @@ +from pathlib import Path + import numpy as np import pytest from unittest.mock import Mock @@ -100,13 +102,13 @@ def test_check_labels_features_exist(messages, expected): async def _train_persist_load_with_different_settings( - pipeline, component_builder, tmpdir + pipeline, component_builder, tmp_path ): _config = RasaNLUModelConfig({"pipeline": pipeline, "language": "en"}) (trainer, trained, persisted_path) = await train( _config, - path=tmpdir.strpath, + path=str(tmp_path), data="data/examples/rasa/demo-rasa-multi-intent.md", component_builder=component_builder, ) @@ -150,7 +152,7 @@ async def test_train_persist_load_with_different_settings(component_builder, tmp ) -async def test_raise_error_on_incorrect_pipeline(component_builder, tmpdir): +async def test_raise_error_on_incorrect_pipeline(component_builder, tmp_path: Path): _config = RasaNLUModelConfig( { "pipeline": [ @@ -164,7 +166,7 @@ async def test_raise_error_on_incorrect_pipeline(component_builder, tmpdir): with pytest.raises(Exception) as e: await train( _config, - path=tmpdir.strpath, + path=str(tmp_path), data=DEFAULT_DATA_PATH, component_builder=component_builder, ) @@ -216,7 +218,7 @@ def as_pipeline(*components): ) async def 
test_softmax_normalization( component_builder, - tmpdir, + tmp_path, classifier_params, data_path, output_length, @@ -231,7 +233,7 @@ async def test_softmax_normalization( _config = RasaNLUModelConfig({"pipeline": pipeline}) (trained_model, _, persisted_path) = await train( _config, - path=tmpdir.strpath, + path=str(tmp_path), data=data_path, component_builder=component_builder, ) diff --git a/tests/nlu/conftest.py b/tests/nlu/conftest.py index cdd86c43c1ad..11ec0e1506fc 100644 --- a/tests/nlu/conftest.py +++ b/tests/nlu/conftest.py @@ -38,36 +38,6 @@ def blank_config() -> RasaNLUModelConfig: return RasaNLUModelConfig({"language": "en", "pipeline": []}) -@pytest.fixture(scope="session") -def config_path() -> Text: - return write_file_config( - { - "language": "en", - "pipeline": [ - {"name": "WhitespaceTokenizer"}, - {"name": "CRFEntityExtractor", EPOCHS: 1, RANDOM_SEED: 42}, - {"name": "CountVectorsFeaturizer"}, - {"name": "DIETClassifier", EPOCHS: 1, RANDOM_SEED: 42}, - ], - } - ).name - - -@pytest.fixture(scope="session") -def config_path_duplicate() -> Text: - return write_file_config( - { - "language": "en", - "pipeline": [ - {"name": "WhitespaceTokenizer"}, - {"name": "CRFEntityExtractor", EPOCHS: 1, RANDOM_SEED: 42}, - {"name": "CountVectorsFeaturizer"}, - {"name": "DIETClassifier", EPOCHS: 1, RANDOM_SEED: 42}, - ], - } - ).name - - @pytest.fixture() def pretrained_embeddings_spacy_config() -> RasaNLUModelConfig: return RasaNLUModelConfig( diff --git a/tests/nlu/test_evaluation.py b/tests/nlu/test_evaluation.py index 8d0d607e2ece..313e299f829e 100644 --- a/tests/nlu/test_evaluation.py +++ b/tests/nlu/test_evaluation.py @@ -1,3 +1,5 @@ +from pathlib import Path + from sanic.request import Request from typing import Text, Iterator, List, Dict, Any @@ -51,12 +53,12 @@ from tests.nlu.conftest import DEFAULT_DATA_PATH from rasa.nlu.selectors.response_selector import ResponseSelector from rasa.nlu.test import is_response_selector_present -from rasa.utils.tensorflow.constants import EPOCHS, ENTITY_RECOGNITION - +from rasa.utils.tensorflow.constants import EPOCHS, ENTITY_RECOGNITION, RANDOM_SEED # https://github.com/pytest-dev/pytest-asyncio/issues/68 # this event_loop is used by pytest-asyncio, and redefining it # is currently the only way of changing the scope of this fixture +from tests.nlu.utilities import write_file_config @pytest.yield_fixture(scope="session") @@ -439,9 +441,10 @@ def test_response_selector_present(): assert not is_response_selector_present(interpreter_without_response_selector) -def test_intent_evaluation_report(tmpdir_factory): - path = tmpdir_factory.mktemp("evaluation").strpath - report_folder = os.path.join(path, "reports") +def test_intent_evaluation_report(tmp_path: Path): + path = tmp_path / "evaluation" + path.mkdir() + report_folder = str(path / "reports") report_filename = os.path.join(report_folder, "intent_report.json") rasa.utils.io.create_directory(report_folder) @@ -486,8 +489,9 @@ def test_intent_evaluation_report(tmpdir_factory): assert os.path.exists(os.path.join(report_folder, "intent_successes.json")) -def test_intent_evaluation_report_large(tmpdir_factory: TempdirFactory): - path = tmpdir_factory.mktemp("evaluation") +def test_intent_evaluation_report_large(tmp_path: Path): + path = tmp_path / "evaluation" + path.mkdir() report_folder = path / "reports" report_filename = report_folder / "intent_report.json" @@ -509,7 +513,7 @@ def incorrect(label: Text, _label: Text) -> IntentEvaluationResult: evaluate_intents( intent_results, - report_folder, + 
str(report_folder), successes=False, errors=False, disable_plotting=True, @@ -541,9 +545,10 @@ def incorrect(label: Text, _label: Text) -> IntentEvaluationResult: assert report["C"]["confused_with"] == c_confused_with -def test_response_evaluation_report(tmpdir_factory): - path = tmpdir_factory.mktemp("evaluation").strpath - report_folder = os.path.join(path, "reports") +def test_response_evaluation_report(tmp_path: Path): + path = tmp_path / "evaluation" + path.mkdir() + report_folder = str(path / "reports") report_filename = os.path.join(report_folder, "response_selection_report.json") rasa.utils.io.create_directory(report_folder) @@ -629,7 +634,7 @@ def test_get_entity_extractors(components, expected_extractors): assert extractors == expected_extractors -def test_entity_evaluation_report(tmpdir_factory): +def test_entity_evaluation_report(tmp_path): class EntityExtractorA(EntityExtractor): provides = ["entities"] @@ -646,8 +651,9 @@ def __init__(self, component_config=None) -> None: super().__init__(component_config) - path = tmpdir_factory.mktemp("evaluation").strpath - report_folder = os.path.join(path, "reports") + path = tmp_path / "evaluation" + path.mkdir() + report_folder = str(path / "reports") report_filename_a = os.path.join(report_folder, "EntityExtractorA_report.json") report_filename_b = os.path.join(report_folder, "EntityExtractorB_report.json") @@ -845,12 +851,23 @@ def test_label_replacement(): assert substitute_labels(original_labels, "O", "no_entity") == target_labels -def test_nlu_comparison(tmpdir, config_path, config_path_duplicate): +def test_nlu_comparison(tmp_path: Path): + config = { + "language": "en", + "pipeline": [ + {"name": "WhitespaceTokenizer"}, + {"name": "KeywordIntentClassifier"}, + {"name": "RegexEntityExtractor"}, + ], + } # the configs need to be at a different path, otherwise the results are # combined on the same dictionary key and cannot be plotted properly - configs = [config_path, config_path_duplicate] + configs = [ + write_file_config(config).name, + write_file_config(config).name, + ] - output = tmpdir.strpath + output = str(tmp_path) compare_nlu_models( configs, DEFAULT_DATA_PATH, output, runs=2, exclusion_percentages=[50, 80] ) diff --git a/tests/nlu/test_persistor.py b/tests/nlu/test_persistor.py index 25ce5cc76161..6e4016a91878 100644 --- a/tests/nlu/test_persistor.py +++ b/tests/nlu/test_persistor.py @@ -13,7 +13,7 @@ class Object: # noinspection PyPep8Naming -async def test_list_method_method_in_AWS_persistor(component_builder, tmpdir): +async def test_list_method_method_in_AWS_persistor(component_builder, tmp_path): with mock_s3(): # artificially create a persisted model _config = RasaNLUModelConfig( @@ -26,7 +26,7 @@ async def test_list_method_method_in_AWS_persistor(component_builder, tmpdir): (trained, _, persisted_path) = await train( _config, data="data/test/demo-rasa-small.json", - path=tmpdir.strpath, + path=str(tmp_path), storage="aws", component_builder=component_builder, ) diff --git a/tests/nlu/tokenizers/test_jieba_tokenizer.py b/tests/nlu/tokenizers/test_jieba_tokenizer.py index 426215541587..45894b10d2fe 100644 --- a/tests/nlu/tokenizers/test_jieba_tokenizer.py +++ b/tests/nlu/tokenizers/test_jieba_tokenizer.py @@ -1,3 +1,4 @@ +from pathlib import Path from unittest.mock import patch from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer @@ -33,8 +34,8 @@ def test_jieba(text, expected_tokens, expected_indices): assert [t.end for t in tokens] == [i[1] for i in expected_indices] -def 
test_jieba_load_dictionary(tmpdir_factory): - dictionary_path = tmpdir_factory.mktemp("jieba_custom_dictionary").strpath +def test_jieba_load_dictionary(tmp_path: Path): + dictionary_path = str(tmp_path) component_config = {"dictionary_path": dictionary_path} diff --git a/tests/nlu/training_data/test_entities_parser.py b/tests/nlu/training_data/test_entities_parser.py index c84ec01b068c..6573e5d5f416 100644 --- a/tests/nlu/training_data/test_entities_parser.py +++ b/tests/nlu/training_data/test_entities_parser.py @@ -113,3 +113,26 @@ def test_markdown_entity_regex( replaced_text = entities_parser.replace_entities(example) assert replaced_text == expected_text + + +def test_parse_training_example(): + message = entities_parser.parse_training_example("Hello!", intent="greet") + assert message.get("intent") == "greet" + assert message.text == "Hello!" + + +def test_parse_empty_example(): + message = entities_parser.parse_training_example("") + assert message.get("intent") is None + assert message.text == "" + + +def test_parse_training_example_with_entities(): + message = entities_parser.parse_training_example( + "I am from [Berlin](city).", intent="inform" + ) + assert message.get("intent") == "inform" + assert message.text == "I am from Berlin." + assert message.get("entities") == [ + {"start": 10, "end": 16, "value": "Berlin", "entity": "city"} + ] diff --git a/tests/test_data.py b/tests/test_data.py index 22b3f9ad0052..e08bdda02a80 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -3,6 +3,8 @@ from pathlib import Path +import pytest + from rasa.constants import DEFAULT_E2E_TESTS_PATH from rasa import data from rasa.utils.io import write_text_file @@ -34,7 +36,19 @@ def test_default_story_files_are_story_files(): assert data.is_story_file(fn) -def test_default_conversation_tests_are_conversation_tests(tmpdir: Path): +def test_default_conversation_tests_are_conversation_tests_yml(tmpdir: Path): + parent = tmpdir / DEFAULT_E2E_TESTS_PATH + Path(parent).mkdir(parents=True) + + e2e_path = parent / "test_stories.yml" + e2e_story = """stories:""" + write_text_file(e2e_story, e2e_path) + + assert data.is_test_stories_file(str(e2e_path)) + + +def test_default_conversation_tests_are_conversation_tests_md(tmpdir: Path): + # can be removed once conversation tests MD support is removed parent = tmpdir / DEFAULT_E2E_TESTS_PATH Path(parent).mkdir(parents=True) @@ -42,7 +56,7 @@ def test_default_conversation_tests_are_conversation_tests(tmpdir: Path): e2e_story = """## my story test""" write_text_file(e2e_story, e2e_path) - assert data.is_end_to_end_conversation_test_file(str(e2e_path)) + assert data.is_test_stories_file(str(e2e_path)) def test_nlu_data_files_are_not_conversation_tests(tmpdir: Path): @@ -58,7 +72,7 @@ def test_nlu_data_files_are_not_conversation_tests(tmpdir: Path): """ write_text_file(nlu_data, nlu_path) - assert not data.is_end_to_end_conversation_test_file(str(nlu_path)) + assert not data.is_test_stories_file(str(nlu_path)) def test_domain_files_are_not_conversation_tests(tmpdir: Path): @@ -67,4 +81,44 @@ def test_domain_files_are_not_conversation_tests(tmpdir: Path): domain_path = parent / "domain.yml" - assert not data.is_end_to_end_conversation_test_file(str(domain_path)) + assert not data.is_test_stories_file(str(domain_path)) + + +@pytest.mark.parametrize( + "path,is_yaml", + [ + ("my_file.yaml", True), + ("my_file.yml", True), + ("/a/b/c/my_file.yml", True), + ("/a/b/c/my_file.ml", False), + ("my_file.md", False), + ], +) +def test_is_yaml_file(path, is_yaml): + assert 
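The new `entities_parser` tests above show how a Markdown training example is parsed: the `[value](entity)` markup is removed from the text and the recorded offsets refer to the cleaned string. A minimal sketch, assuming `entities_parser` is imported from `rasa.nlu.training_data` as in the test module:

```python
from rasa.nlu.training_data import entities_parser

message = entities_parser.parse_training_example(
    "I am from [Berlin](city).", intent="inform"
)
# offsets point into the cleaned text, not the annotated source line
assert message.text == "I am from Berlin."
assert message.get("entities") == [
    {"start": 10, "end": 16, "value": "Berlin", "entity": "city"}
]
```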
data.is_likely_yaml_file(path) == is_yaml + + +@pytest.mark.parametrize( + "path,is_md", + [ + ("my_file.md", True), + ("/a/b/c/my_file.md", True), + ("/a/b/c/my_file.yml", False), + ("my_file.yaml", False), + ], +) +def test_is_md_file(path, is_md): + assert data.is_likely_markdown_file(path) == is_md + + +@pytest.mark.parametrize( + "path,is_json", + [ + ("my_file.json", True), + ("/a/b/c/my_file.json", True), + ("/a/b/c/my_file.yml", False), + ("my_file.md", False), + ], +) +def test_is_json_file(path, is_json): + assert data.is_likely_json_file(path) == is_json diff --git a/tests/test_server.py b/tests/test_server.py index ac06dae9f098..c9536bad5b09 100644 --- a/tests/test_server.py +++ b/tests/test_server.py @@ -370,6 +370,7 @@ def test_train_stack_success( default_stories_file: Text, default_stack_config: Text, default_nlu_data: Text, + tmp_path: Path, ): with ExitStack() as stack: domain_file = stack.enter_context(open(default_domain_path)) @@ -390,8 +391,7 @@ def test_train_stack_success( assert response.headers["filename"] is not None # save model to temporary file - tempdir = tempfile.mkdtemp() - model_path = os.path.join(tempdir, "model.tar.gz") + model_path = str(tmp_path / "model.tar.gz") with open(model_path, "wb") as f: f.write(response.body) @@ -405,6 +405,7 @@ def test_train_nlu_success( default_stack_config: Text, default_nlu_data: Text, default_domain_path: Text, + tmp_path: Path, ): domain_data = rasa_utils.io.read_yaml_file(default_domain_path) config_data = rasa_utils.io.read_yaml_file(default_stack_config) @@ -426,8 +427,7 @@ def test_train_nlu_success( assert response.status == 200 # save model to temporary file - tempdir = tempfile.mkdtemp() - model_path = os.path.join(tempdir, "model.tar.gz") + model_path = str(tmp_path / "model.tar.gz") with open(model_path, "wb") as f: f.write(response.body) @@ -441,6 +441,7 @@ def test_train_core_success( default_stack_config: Text, default_stories_file: Text, default_domain_path: Text, + tmp_path: Path, ): with ExitStack() as stack: domain_file = stack.enter_context(open(default_domain_path)) @@ -457,8 +458,7 @@ def test_train_core_success( assert response.status == 200 # save model to temporary file - tempdir = tempfile.mkdtemp() - model_path = os.path.join(tempdir, "model.tar.gz") + model_path = str(tmp_path / "model.tar.gz") with open(model_path, "wb") as f: f.write(response.body) @@ -468,7 +468,7 @@ def test_train_core_success( def test_train_with_retrieval_events_success( - rasa_app: SanicTestClient, default_stack_config: Text + rasa_app: SanicTestClient, default_stack_config: Text, tmp_path: Path ): with ExitStack() as stack: domain_file = stack.enter_context( @@ -493,13 +493,12 @@ def test_train_with_retrieval_events_success( _, response = rasa_app.post("/model/train", json=payload) assert response.status == 200 - assert_trained_model(response.body) + assert_trained_model(response.body, tmp_path) -def assert_trained_model(response_body: bytes) -> None: +def assert_trained_model(response_body: bytes, tmp_path: Path) -> None: # save model to temporary file - tempdir = tempfile.mkdtemp() - model_path = os.path.join(tempdir, "model.tar.gz") + model_path = str(tmp_path / "model.tar.gz") with open(model_path, "wb") as f: f.write(response_body) @@ -534,7 +533,7 @@ def test_deprecation_warnings_json_payload(payload: Dict): rasa.server._validate_json_training_payload(payload) -def test_train_with_yaml(rasa_app: SanicTestClient): +def test_train_with_yaml(rasa_app: SanicTestClient, tmp_path: Path): training_data = """ stories: - 
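Judging by the parametrized cases above, the new `is_likely_*` helpers in `rasa.data` classify files by extension alone (hence the "likely"), without reading their content. A small usage sketch:

```python
from rasa import data

assert data.is_likely_yaml_file("/a/b/c/my_file.yml")
assert data.is_likely_markdown_file("my_file.md")
assert data.is_likely_json_file("my_file.json")
assert not data.is_likely_yaml_file("my_file.md")  # extension decides
```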
@@ -534,7 +533,7 @@ def test_deprecation_warnings_json_payload(payload: Dict):
     rasa.server._validate_json_training_payload(payload)


-def test_train_with_yaml(rasa_app: SanicTestClient):
+def test_train_with_yaml(rasa_app: SanicTestClient, tmp_path: Path):
     training_data = """
 stories:
 - story: My story
@@ -580,7 +579,7 @@ def test_train_with_yaml(rasa_app: SanicTestClient):
     )

     assert response.status == 200
-    assert_trained_model(response.body)
+    assert_trained_model(response.body, tmp_path)


 def test_train_with_invalid_yaml(rasa_app: SanicTestClient):
@@ -706,6 +705,7 @@ def test_evaluate_stories_end_to_end(
         "is_end_to_end_evaluation",
     }
     assert js["is_end_to_end_evaluation"]
+    assert js["actions"] != []
     assert set(js["actions"][0].keys()) == {
         "action",
         "predicted",
diff --git a/tests/test_test.py b/tests/test_test.py
index 126ad90836d4..9e72e009e6ca 100644
--- a/tests/test_test.py
+++ b/tests/test_test.py
@@ -1,17 +1,24 @@
 import asyncio
 import sys
 from pathlib import Path
+import textwrap
 from typing import Text
-from unittest.mock import Mock

 import pytest
 from _pytest.capture import CaptureFixture
 from _pytest.monkeypatch import MonkeyPatch

+import rasa.utils.io
+from rasa.core.events import UserUttered
+from rasa.core.test import (
+    EvaluationStore,
+    WronglyClassifiedUserUtterance,
+    WronglyPredictedAction,
+)
+from rasa.core.trackers import DialogueStateTracker
+from rasa.core.training.story_writer.yaml_story_writer import YAMLStoryWriter
 import rasa.model
 import rasa.cli.utils
-from rasa.core.agent import Agent
-from rasa.core.interpreter import RasaNLUInterpreter, RegexInterpreter
 from rasa.nlu.test import NO_ENTITY
 import rasa.core
@@ -135,8 +142,8 @@ def test_get_label_set(targets, exclude_label, expected):
     assert set(expected) == set(actual)


-async def test_interpreter_passed_to_agent(
-    monkeypatch: MonkeyPatch, trained_rasa_model: Text
+async def test_e2e_warning_if_no_nlu_model(
+    monkeypatch: MonkeyPatch, trained_core_model: Text, capsys: CaptureFixture
 ):
     from rasa.test import test_core

@@ -146,34 +153,51 @@
         sys.modules["rasa.test"], "_test_core", asyncio.coroutine(lambda *_, **__: True)
     )

-    agent_load = Mock()
-    monkeypatch.setattr(Agent, "load", agent_load)
-
-    test_core(trained_rasa_model)
+    test_core(trained_core_model, additional_arguments={"e2e": True})

-    agent_load.assert_called_once()
-    _, _, kwargs = agent_load.mock_calls[0]
-    assert isinstance(kwargs["interpreter"], RasaNLUInterpreter)
+    assert "No NLU model found. Using default" in capsys.readouterr().out
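The rewritten test above stubs the heavy `_test_core` coroutine and then only checks the warning printed to stdout via `capsys`. A self-contained sketch of that capture pattern; the function and the tail of the message here are stand-ins, not Rasa's actual implementation:

```python
from _pytest.capture import CaptureFixture


def run_e2e_evaluation() -> None:
    # Stand-in for the code path under test, which warns when
    # end-to-end evaluation runs without a trained NLU model.
    print("No NLU model found. Using default interpreter.")


def test_warns_without_nlu_model(capsys: CaptureFixture):
    run_e2e_evaluation()

    # readouterr() drains everything written to stdout/stderr since the
    # last read, so assert immediately after the call under test.
    assert "No NLU model found. Using default" in capsys.readouterr().out
```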

-async def test_e2e_warning_if_no_nlu_model(
-    monkeypatch: MonkeyPatch, trained_core_model: Text, capsys: CaptureFixture
-):
-    from rasa.test import test_core
-
-    # Patching is bit more complicated as we have a module `train` and function
-    # with the same name 😬
-    monkeypatch.setattr(
-        sys.modules["rasa.test"], "_test_core", asyncio.coroutine(lambda *_, **__: True)
+def test_write_classification_errors():
+    evaluation = EvaluationStore(
+        action_predictions=["utter_goodbye"],
+        action_targets=["utter_greet"],
+        intent_predictions=["goodbye"],
+        intent_targets=["greet"],
+        entity_predictions=None,
+        entity_targets=None,
+    )
+    events = [
+        WronglyClassifiedUserUtterance(
+            UserUttered("Hello", {"name": "goodbye"}), evaluation
+        ),
+        WronglyPredictedAction("utter_greet", "utter_goodbye"),
+    ]
+    tracker = DialogueStateTracker.from_events("default", events)
+    dump = YAMLStoryWriter().dumps(tracker.as_story().story_steps)
+    assert (
+        dump.strip()
+        == textwrap.dedent(
+            """
+            version: "2.0"
+            stories:
+            - story: default
+              steps:
+              - intent: greet  # predicted: goodbye: Hello
+                user: |-
+                  Hello
+              - action: utter_greet  # predicted: utter_goodbye
+
+            """
+        ).strip()
     )

-    agent_load = Mock()
-    monkeypatch.setattr(Agent, "load", agent_load)

-    test_core(trained_core_model, additional_arguments={"e2e": True})
+def test_log_failed_stories(tmp_path: Path):
+    path = str(tmp_path / "stories.yml")
+    rasa.core.test._log_stories([], path)

-    assert "No NLU model found. Using default" in capsys.readouterr().out
+    dump = rasa.utils.io.read_file(path)

-    agent_load.assert_called_once()
-    _, _, kwargs = agent_load.mock_calls[0]
-    assert isinstance(kwargs["interpreter"], RegexInterpreter)
+    assert dump.startswith("#")
+    assert len(dump.split("\n")) == 1
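`test_write_classification_errors` above compares a multi-line YAML dump against an inline golden string. The `textwrap.dedent(...).strip()` idiom that keeps such expectations readable is worth isolating; a generic sketch, with a toy renderer standing in for `YAMLStoryWriter().dumps(...)`:

```python
import textwrap


def render_story(name: str) -> str:
    # Toy stand-in for any multi-line serializer, such as
    # YAMLStoryWriter().dumps(...) in the test above.
    return f'version: "2.0"\nstories:\n- story: {name}\n'


def test_golden_string_comparison():
    # dedent() lets the expected value stay indented alongside the test
    # code; strip() on both sides ignores leading/trailing blank lines.
    expected = textwrap.dedent(
        """
        version: "2.0"
        stories:
        - story: default
        """
    ).strip()

    assert render_story("default").strip() == expected
```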
diff --git a/tests/test_train.py b/tests/test_train.py
index 73fd2582301d..bd772e40c0c4 100644
--- a/tests/test_train.py
+++ b/tests/test_train.py
@@ -74,15 +74,18 @@ def count_temp_rasa_files(directory: Text) -> int:


 def test_train_temp_files(
-    tmp_path: Text,
+    tmp_path: Path,
     monkeypatch: MonkeyPatch,
     default_domain_path: Text,
     default_stories_file: Text,
     default_stack_config: Text,
     default_nlu_data: Text,
 ):
-    monkeypatch.setattr(tempfile, "tempdir", tmp_path)
-    output = "test_train_temp_files_models"
+    (tmp_path / "training").mkdir()
+    (tmp_path / "models").mkdir()
+
+    monkeypatch.setattr(tempfile, "tempdir", tmp_path / "training")
+    output = str(tmp_path / "models")

     train(
         default_domain_path,
@@ -108,36 +111,40 @@


 def test_train_core_temp_files(
-    tmp_path: Text,
+    tmp_path: Path,
     monkeypatch: MonkeyPatch,
     default_domain_path: Text,
     default_stories_file: Text,
     default_stack_config: Text,
 ):
-    monkeypatch.setattr(tempfile, "tempdir", tmp_path)
+    (tmp_path / "training").mkdir()
+    (tmp_path / "models").mkdir()
+
+    monkeypatch.setattr(tempfile, "tempdir", tmp_path / "training")

     train_core(
         default_domain_path,
         default_stack_config,
         default_stories_file,
-        output="test_train_core_temp_files_models",
+        output=str(tmp_path / "models"),
     )

     assert count_temp_rasa_files(tempfile.tempdir) == 0


 def test_train_nlu_temp_files(
-    tmp_path: Text,
+    tmp_path: Path,
     monkeypatch: MonkeyPatch,
     default_stack_config: Text,
     default_nlu_data: Text,
 ):
-    monkeypatch.setattr(tempfile, "tempdir", tmp_path)
+    (tmp_path / "training").mkdir()
+    (tmp_path / "models").mkdir()
+
+    monkeypatch.setattr(tempfile, "tempdir", tmp_path / "training")

     train_nlu(
-        default_stack_config,
-        default_nlu_data,
-        output="test_train_nlu_temp_files_models",
+        default_stack_config, default_nlu_data, output=str(tmp_path / "models"),
     )

     assert count_temp_rasa_files(tempfile.tempdir) == 0
@@ -150,12 +157,13 @@ def test_train_nlu_wrong_format_error_message(
     default_stack_config: Text,
     incorrect_nlu_data: Text,
 ):
-    monkeypatch.setattr(tempfile, "tempdir", tmp_path)
+    (tmp_path / "training").mkdir()
+    (tmp_path / "models").mkdir()
+
+    monkeypatch.setattr(tempfile, "tempdir", tmp_path / "training")

     train_nlu(
-        default_stack_config,
-        incorrect_nlu_data,
-        output="test_train_nlu_temp_files_models",
+        default_stack_config, incorrect_nlu_data, output=str(tmp_path / "models"),
     )

     captured = capsys.readouterr()
@@ -168,9 +176,12 @@ def test_train_nlu_no_nlu_file_error_message(
     monkeypatch: MonkeyPatch,
     default_stack_config: Text,
 ):
-    monkeypatch.setattr(tempfile, "tempdir", tmp_path)
+    (tmp_path / "training").mkdir()
+    (tmp_path / "models").mkdir()
+
+    monkeypatch.setattr(tempfile, "tempdir", tmp_path / "training")

-    train_nlu(default_stack_config, "", output="test_train_nlu_temp_files_models")
+    train_nlu(default_stack_config, "", output=str(tmp_path / "models"))

     captured = capsys.readouterr()
     assert "No NLU data given" in captured.out
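Each `test_train*` case above now points the stdlib's module-level `tempfile.tempdir` at a subdirectory of `tmp_path`, so any temp files the training code leaks stay inside a sandbox the test can inspect. A distilled sketch of that technique; the `NamedTemporaryFile` call is a stand-in for the code under test:

```python
import tempfile
from pathlib import Path

from _pytest.monkeypatch import MonkeyPatch


def test_leaves_no_temp_files(tmp_path: Path, monkeypatch: MonkeyPatch):
    sandbox = tmp_path / "training"
    sandbox.mkdir()

    # tempfile consults the module-global `tempdir` first, so patching
    # it redirects every subsequent mkdtemp()/NamedTemporaryFile() call;
    # monkeypatch restores the original value when the test ends.
    monkeypatch.setattr(tempfile, "tempdir", str(sandbox))

    with tempfile.NamedTemporaryFile():
        pass  # stand-in for the code under test; the file is deleted on exit

    # Anything the code forgot to clean up would still be in `sandbox`.
    assert list(sandbox.iterdir()) == []
```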