Merge pull request #7033 from RasaHQ/story-processing

Fixes for story/rules reading and writing for both formats
RasaHQ · Oct 20, 2020 · 93d26fc · 93d26fc
2 parents ca551ca + 4bf0928
commit 93d26fc
Show file tree

Hide file tree

Showing 10 changed files with 115 additions and 18 deletions.
diff --git a/changelog/7033.bugfix.md b/changelog/7033.bugfix.md
@@ -0,0 +1,2 @@
+- Fix `YAMLStoryReader` not being able to represent [`OR` statements](stories.mdx#or-statements) in conversion mode.
+- Fix `MarkdownStoryWriter` not being able to write stories with `OR` statements (when loaded in conversion mode).
diff --git a/rasa/core/training/converters/story_markdown_to_yaml_converter.py b/rasa/core/training/converters/story_markdown_to_yaml_converter.py
@@ -41,12 +41,12 @@ async def convert_and_write(cls, source_path: Path, output_path: Path) -> None:
 
         # check if source file is test stories file
         if MarkdownStoryReader.is_test_stories_file(source_path):
-            reader = MarkdownStoryReader(is_used_for_conversion=True, use_e2e=True)
+            reader = MarkdownStoryReader(is_used_for_training=False, use_e2e=True)
             output_core_path = cls._generate_path_for_converted_test_data_file(
                 source_path, output_path
             )
         else:
-            reader = MarkdownStoryReader(is_used_for_conversion=True)
+            reader = MarkdownStoryReader(is_used_for_training=False)
             output_core_path = cls.generate_path_for_converted_training_data_file(
                 source_path, output_path
             )

diff --git a/rasa/shared/core/training_data/story_reader/markdown_story_reader.py b/rasa/shared/core/training_data/story_reader/markdown_story_reader.py
@@ -186,7 +186,7 @@ def _add_user_messages(self, messages: List[Text], line_num: int) -> None:
             )
         parsed_messages = [self._parse_message(m, line_num) for m in messages]
         self.current_step_builder.add_user_messages(
-            parsed_messages, self.is_used_for_conversion
+            parsed_messages, self._is_used_for_training
         )
 
     def _add_e2e_messages(self, e2e_messages: List[Text], line_num: int) -> None:
@@ -204,7 +204,7 @@ def _add_e2e_messages(self, e2e_messages: List[Text], line_num: int) -> None:
         self.current_step_builder.add_user_messages(parsed_messages)
 
     @staticmethod
-    def parse_e2e_message(line: Text, is_used_for_conversion: bool = False) -> Message:
+    def parse_e2e_message(line: Text, is_used_for_training: bool = True) -> Message:
         """Parses an md list item line based on the current section type.
 
         Matches expressions of the form `<intent>:<example>`. For the
@@ -231,7 +231,7 @@ def parse_e2e_message(line: Text, is_used_for_conversion: bool = False) -> Messa
         intent = match.group(2)
         message = match.group(4)
         example = entities_parser.parse_training_example(message, intent)
-        if is_used_for_conversion:
+        if not is_used_for_training:
             # In case this is a simple conversion from Markdown we should copy over
             # the original text and not parse the entities
             example.data[rasa.shared.nlu.constants.TEXT] = message
@@ -248,7 +248,7 @@ def parse_e2e_message(line: Text, is_used_for_conversion: bool = False) -> Messa
     def _parse_message(self, message: Text, line_num: int) -> UserUttered:
 
         if self.use_e2e:
-            parsed = self.parse_e2e_message(message, self.is_used_for_conversion)
+            parsed = self.parse_e2e_message(message, self._is_used_for_training)
             text = parsed.get("text")
             intent = {INTENT_NAME_KEY: parsed.get("intent")}
             entities = parsed.get("entities")

diff --git a/rasa/shared/core/training_data/story_reader/story_reader.py b/rasa/shared/core/training_data/story_reader/story_reader.py
@@ -28,7 +28,7 @@ def __init__(
         template_vars: Optional[Dict] = None,
         use_e2e: bool = False,
         source_name: Optional[Text] = None,
-        is_used_for_conversion: bool = False,
+        is_used_for_training: bool = True,
     ) -> None:
         """Constructor for the StoryReader.
 
@@ -37,9 +37,9 @@ def __init__(
             template_vars: Template variables to be replaced.
             use_e2e: Specifies whether to use the e2e parser or not.
             source_name: Name of the training data source.
-            is_used_for_conversion: Identifies if the user utterances should be parsed
+            is_used_for_training: Identifies if the user utterances should be parsed
               (entities are extracted and removed from the original text) and
-              OR statements should be unfolded . This parameter is used only to
+              OR statements should be unfolded. This parameter is used only to
               simplify the conversation from MD story files. Don't use it other ways,
               because it ends up in a invalid story that cannot be user for real
               training. Default value is `False`, which preserves the expected behavior
@@ -51,7 +51,7 @@ def __init__(
         self.template_variables = template_vars if template_vars else {}
         self.use_e2e = use_e2e
         self.source_name = source_name
-        self.is_used_for_conversion = is_used_for_conversion
+        self._is_used_for_training = is_used_for_training
         self._is_parsing_conditions = False
 
     def read_from_file(self, filename: Text) -> List[StoryStep]:

diff --git a/rasa/shared/core/training_data/story_reader/story_step_builder.py b/rasa/shared/core/training_data/story_reader/story_step_builder.py
@@ -61,13 +61,13 @@ def _prev_end_checkpoints(self) -> List[Checkpoint]:
             return [Checkpoint(name) for name in end_names]
 
     def add_user_messages(
-        self, messages: List[UserUttered], is_used_for_conversion: bool = False
+        self, messages: List[UserUttered], is_used_for_training: bool = True
     ) -> None:
         """Adds next story steps with the user's utterances.
 
         Args:
             messages: User utterances.
-            is_used_for_conversion: Identifies if the user utterance is a part of
+            is_used_for_training: Identifies if the user utterance is a part of
               OR statement. This parameter is used only to simplify the conversation
               from MD story files. Don't use it other ways, because it ends up
               in a invalid story that cannot be user for real training.
@@ -82,7 +82,7 @@ def add_user_messages(
                 t.add_user_message(messages[0])
         else:
             # this simplifies conversion between formats, but breaks the logic
-            if is_used_for_conversion:
+            if not is_used_for_training:
                 for t in self.current_steps:
                     t.add_events(messages)
                 return

diff --git a/rasa/shared/core/training_data/story_reader/yaml_story_reader.py b/rasa/shared/core/training_data/story_reader/yaml_story_reader.py
@@ -66,7 +66,7 @@ def from_reader(cls, reader: "YAMLStoryReader") -> "YAMLStoryReader":
             reader.template_variables,
             reader.use_e2e,
             reader.source_name,
-            reader.is_used_for_conversion,
+            reader._is_used_for_training,
         )
 
     def read_from_file(self, filename: Union[Text, Path]) -> List[StoryStep]:
@@ -336,7 +336,9 @@ def _parse_or_statement(self, step: Dict[Text, Any]) -> None:
                 )
                 return
 
-        self.current_step_builder.add_user_messages(utterances)
+        self.current_step_builder.add_user_messages(
+            utterances, self._is_used_for_training
+        )
 
     def _user_intent_from_step(self, step: Dict[Text, Any]) -> Text:
         user_intent = step.get(KEY_USER_INTENT, "").strip()

diff --git a/rasa/shared/core/training_data/structures.py b/rasa/shared/core/training_data/structures.py
@@ -17,6 +17,7 @@
     SessionStarted,
 )
 from rasa.shared.core.trackers import DialogueStateTracker  # pytype: disable=pyi-error
+from rasa.shared.exceptions import RasaCoreException
 
 if typing.TYPE_CHECKING:
     import networkx as nx
@@ -41,6 +42,13 @@
 STEP_COUNT = 1
 
 
+class EventTypeError(RasaCoreException, ValueError):
+    """Represents an error caused by a Rasa Core event not being of the expected
+    type.
+
+    """
+
+
 class Checkpoint:
     def __init__(
         self, name: Optional[Text], conditions: Optional[Dict[Text, Any]] = None
@@ -131,6 +139,19 @@ def _user_string(story_step_element: UserUttered, e2e: bool) -> Text:
     def _bot_string(story_step_element: Event) -> Text:
         return f"    - {story_step_element.as_story_string()}\n"
 
+    @staticmethod
+    def _or_string(story_step_element: List[Event], e2e: bool) -> Text:
+        for event in story_step_element:
+            if not isinstance(event, UserUttered):
+                raise EventTypeError(
+                    "OR statement events must be of type `UserUttered`."
+                )
+
+        result = " OR ".join(
+            [element.as_story_string(e2e) for element in story_step_element]
+        )
+        return f"* {result}\n"
+
     def as_story_string(self, flat: bool = False, e2e: bool = False) -> Text:
         # if the result should be flattened, we
         # will exclude the caption and any checkpoints.
@@ -156,6 +177,11 @@ def as_story_string(self, flat: bool = False, e2e: bool = False) -> Text:
                 converted = s.as_story_string()  # pytype: disable=attribute-error
                 if converted:
                     result += self._bot_string(s)
+            elif isinstance(s, list):
+                # The story reader classes support reading stories in
+                # conversion mode.  When this mode is enabled, OR statements
+                # are represented as lists of events.
+                result += self._or_string(s, e2e)
             else:
                 raise Exception(f"Unexpected element in story step: {s}")
 

diff --git a/tests/shared/core/training_data/story_reader/test_yaml_story_reader.py b/tests/shared/core/training_data/story_reader/test_yaml_story_reader.py
@@ -359,3 +359,35 @@ def test_read_mixed_training_data_file(default_domain: Domain):
     with pytest.warns(None) as record:
         reader.read_from_parsed_yaml(yaml_content)
         assert not len(record)
+
+
+def test_or_statement_if_not_training_mode():
+    stories = """
+    stories:
+    - story: hello world
+      steps:
+      - or:
+        - intent: intent1
+        - intent: intent2
+      - action: some_action
+      - intent: intent3
+      - action: other_action
+    """
+
+    reader = YAMLStoryReader(is_used_for_training=False)
+    yaml_content = rasa.shared.utils.io.read_yaml(stories)
+
+    steps = reader.read_from_parsed_yaml(yaml_content)
+
+    assert len(steps) == 1
+
+    assert len(steps[0].events) == 4  # 4 events in total
+    assert len(steps[0].start_checkpoints) == 1
+    assert steps[0].start_checkpoints[0].name == "STORY_START"
+    assert steps[0].end_checkpoints == []
+
+    or_statement = steps[0].events[0]
+    assert isinstance(or_statement, list)  # But first one is a list (OR)
+
+    assert or_statement[0].intent["name"] == "intent1"
+    assert or_statement[1].intent["name"] == "intent2"
diff --git a/tests/shared/core/training_data/story_writer/test_yaml_story_writer.py b/tests/shared/core/training_data/story_writer/test_yaml_story_writer.py
@@ -38,7 +38,7 @@ async def test_simple_story(
 ):
 
     original_md_reader = MarkdownStoryReader(
-        default_domain, None, False, input_md_file, is_used_for_conversion=True
+        default_domain, None, False, input_md_file, is_used_for_training=False
     )
     original_md_story_steps = original_md_reader.read_from_file(input_md_file)
 
@@ -67,7 +67,7 @@ async def test_story_start_checkpoint_is_skipped(default_domain: Domain):
     input_md_file = "data/test_stories/stories.md"
 
     original_md_reader = MarkdownStoryReader(
-        default_domain, None, False, input_md_file, is_used_for_conversion=True
+        default_domain, None, False, input_md_file, is_used_for_training=False
     )
     original_md_story_steps = original_md_reader.read_from_file(input_md_file)
 
@@ -78,7 +78,7 @@ async def test_story_start_checkpoint_is_skipped(default_domain: Domain):
 
 async def test_forms_are_converted(default_domain: Domain):
     original_md_reader = MarkdownStoryReader(
-        default_domain, None, False, is_used_for_conversion=True
+        default_domain, None, False, is_used_for_training=False
     )
     original_md_story_steps = original_md_reader.read_from_file(
         "data/test_stories/stories_form.md"

diff --git a/tests/shared/core/training_data/test_structures.py b/tests/shared/core/training_data/test_structures.py
@@ -32,6 +32,41 @@ def test_session_start_is_not_serialised(default_domain: Domain):
     assert story.as_story_string(flat=True) == expected
 
 
+def test_as_story_string_or_statement():
+    from rasa.shared.core.training_data.story_reader.yaml_story_reader import (
+        YAMLStoryReader,
+    )
+
+    import rasa.shared.utils.io
+
+    stories = """
+    stories:
+    - story: hello world
+      steps:
+      - or:
+        - intent: intent1
+        - intent: intent2
+        - intent: intent3
+      - action: some_action
+    """
+
+    reader = YAMLStoryReader(is_used_for_training=False)
+    yaml_content = rasa.shared.utils.io.read_yaml(stories)
+
+    steps = reader.read_from_parsed_yaml(yaml_content)
+
+    assert len(steps) == 1
+
+    assert (
+        steps[0].as_story_string()
+        == """
+## hello world
+* intent1 OR intent2 OR intent3
+    - some_action
+"""
+    )
+
+
 def test_cap_length():
     assert (
         rasa.shared.core.training_data.structures._cap_length("mystring", 6) == "mys..."
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		- Fix `YAMLStoryReader` not being able to represent [`OR` statements](stories.mdx#or-statements) in conversion mode.
		- Fix `MarkdownStoryWriter` not being able to write stories with `OR` statements (when loaded in conversion mode).