diff --git a/changelog/7033.bugfix.md b/changelog/7033.bugfix.md new file mode 100644 index 000000000000..a18b83e9eafa --- /dev/null +++ b/changelog/7033.bugfix.md @@ -0,0 +1,2 @@ +- Fix `YAMLStoryReader` not being able to represent [`OR` statements](stories.mdx#or-statements) in conversion mode. +- Fix `MarkdownStoryWriter` not being able to write stories with `OR` statements (when loaded in conversion mode). diff --git a/rasa/core/training/converters/story_markdown_to_yaml_converter.py b/rasa/core/training/converters/story_markdown_to_yaml_converter.py index 2dd557cf23ee..ff51d90d4593 100644 --- a/rasa/core/training/converters/story_markdown_to_yaml_converter.py +++ b/rasa/core/training/converters/story_markdown_to_yaml_converter.py @@ -41,12 +41,12 @@ async def convert_and_write(cls, source_path: Path, output_path: Path) -> None: # check if source file is test stories file if MarkdownStoryReader.is_test_stories_file(source_path): - reader = MarkdownStoryReader(is_used_for_conversion=True, use_e2e=True) + reader = MarkdownStoryReader(is_used_for_training=False, use_e2e=True) output_core_path = cls._generate_path_for_converted_test_data_file( source_path, output_path ) else: - reader = MarkdownStoryReader(is_used_for_conversion=True) + reader = MarkdownStoryReader(is_used_for_training=False) output_core_path = cls.generate_path_for_converted_training_data_file( source_path, output_path ) diff --git a/rasa/shared/core/training_data/story_reader/markdown_story_reader.py b/rasa/shared/core/training_data/story_reader/markdown_story_reader.py index 0dc2e5e05b53..23300510de1e 100644 --- a/rasa/shared/core/training_data/story_reader/markdown_story_reader.py +++ b/rasa/shared/core/training_data/story_reader/markdown_story_reader.py @@ -186,7 +186,7 @@ def _add_user_messages(self, messages: List[Text], line_num: int) -> None: ) parsed_messages = [self._parse_message(m, line_num) for m in messages] self.current_step_builder.add_user_messages( - parsed_messages, self.is_used_for_conversion + parsed_messages, self._is_used_for_training ) def _add_e2e_messages(self, e2e_messages: List[Text], line_num: int) -> None: @@ -204,7 +204,7 @@ def _add_e2e_messages(self, e2e_messages: List[Text], line_num: int) -> None: self.current_step_builder.add_user_messages(parsed_messages) @staticmethod - def parse_e2e_message(line: Text, is_used_for_conversion: bool = False) -> Message: + def parse_e2e_message(line: Text, is_used_for_training: bool = True) -> Message: """Parses an md list item line based on the current section type. Matches expressions of the form `:`. For the @@ -231,7 +231,7 @@ def parse_e2e_message(line: Text, is_used_for_conversion: bool = False) -> Messa intent = match.group(2) message = match.group(4) example = entities_parser.parse_training_example(message, intent) - if is_used_for_conversion: + if not is_used_for_training: # In case this is a simple conversion from Markdown we should copy over # the original text and not parse the entities example.data[rasa.shared.nlu.constants.TEXT] = message @@ -248,7 +248,7 @@ def parse_e2e_message(line: Text, is_used_for_conversion: bool = False) -> Messa def _parse_message(self, message: Text, line_num: int) -> UserUttered: if self.use_e2e: - parsed = self.parse_e2e_message(message, self.is_used_for_conversion) + parsed = self.parse_e2e_message(message, self._is_used_for_training) text = parsed.get("text") intent = {INTENT_NAME_KEY: parsed.get("intent")} entities = parsed.get("entities") diff --git a/rasa/shared/core/training_data/story_reader/story_reader.py b/rasa/shared/core/training_data/story_reader/story_reader.py index 3e8d521898a2..3db7e58f8857 100644 --- a/rasa/shared/core/training_data/story_reader/story_reader.py +++ b/rasa/shared/core/training_data/story_reader/story_reader.py @@ -28,7 +28,7 @@ def __init__( template_vars: Optional[Dict] = None, use_e2e: bool = False, source_name: Optional[Text] = None, - is_used_for_conversion: bool = False, + is_used_for_training: bool = True, ) -> None: """Constructor for the StoryReader. @@ -37,9 +37,9 @@ def __init__( template_vars: Template variables to be replaced. use_e2e: Specifies whether to use the e2e parser or not. source_name: Name of the training data source. - is_used_for_conversion: Identifies if the user utterances should be parsed + is_used_for_training: Identifies if the user utterances should be parsed (entities are extracted and removed from the original text) and - OR statements should be unfolded . This parameter is used only to + OR statements should be unfolded. This parameter is used only to simplify the conversation from MD story files. Don't use it other ways, because it ends up in a invalid story that cannot be user for real training. Default value is `False`, which preserves the expected behavior @@ -51,7 +51,7 @@ def __init__( self.template_variables = template_vars if template_vars else {} self.use_e2e = use_e2e self.source_name = source_name - self.is_used_for_conversion = is_used_for_conversion + self._is_used_for_training = is_used_for_training self._is_parsing_conditions = False def read_from_file(self, filename: Text) -> List[StoryStep]: diff --git a/rasa/shared/core/training_data/story_reader/story_step_builder.py b/rasa/shared/core/training_data/story_reader/story_step_builder.py index dcb2255f82ac..fcb9249f4416 100644 --- a/rasa/shared/core/training_data/story_reader/story_step_builder.py +++ b/rasa/shared/core/training_data/story_reader/story_step_builder.py @@ -61,13 +61,13 @@ def _prev_end_checkpoints(self) -> List[Checkpoint]: return [Checkpoint(name) for name in end_names] def add_user_messages( - self, messages: List[UserUttered], is_used_for_conversion: bool = False + self, messages: List[UserUttered], is_used_for_training: bool = True ) -> None: """Adds next story steps with the user's utterances. Args: messages: User utterances. - is_used_for_conversion: Identifies if the user utterance is a part of + is_used_for_training: Identifies if the user utterance is a part of OR statement. This parameter is used only to simplify the conversation from MD story files. Don't use it other ways, because it ends up in a invalid story that cannot be user for real training. @@ -82,7 +82,7 @@ def add_user_messages( t.add_user_message(messages[0]) else: # this simplifies conversion between formats, but breaks the logic - if is_used_for_conversion: + if not is_used_for_training: for t in self.current_steps: t.add_events(messages) return diff --git a/rasa/shared/core/training_data/story_reader/yaml_story_reader.py b/rasa/shared/core/training_data/story_reader/yaml_story_reader.py index 07cb4dc5cc9e..a0880c63298e 100644 --- a/rasa/shared/core/training_data/story_reader/yaml_story_reader.py +++ b/rasa/shared/core/training_data/story_reader/yaml_story_reader.py @@ -66,7 +66,7 @@ def from_reader(cls, reader: "YAMLStoryReader") -> "YAMLStoryReader": reader.template_variables, reader.use_e2e, reader.source_name, - reader.is_used_for_conversion, + reader._is_used_for_training, ) def read_from_file(self, filename: Union[Text, Path]) -> List[StoryStep]: @@ -336,7 +336,9 @@ def _parse_or_statement(self, step: Dict[Text, Any]) -> None: ) return - self.current_step_builder.add_user_messages(utterances) + self.current_step_builder.add_user_messages( + utterances, self._is_used_for_training + ) def _user_intent_from_step(self, step: Dict[Text, Any]) -> Text: user_intent = step.get(KEY_USER_INTENT, "").strip() diff --git a/rasa/shared/core/training_data/structures.py b/rasa/shared/core/training_data/structures.py index a4c531be4c49..7945287b3621 100644 --- a/rasa/shared/core/training_data/structures.py +++ b/rasa/shared/core/training_data/structures.py @@ -17,6 +17,7 @@ SessionStarted, ) from rasa.shared.core.trackers import DialogueStateTracker # pytype: disable=pyi-error +from rasa.shared.exceptions import RasaCoreException if typing.TYPE_CHECKING: import networkx as nx @@ -41,6 +42,13 @@ STEP_COUNT = 1 +class EventTypeError(RasaCoreException, ValueError): + """Represents an error caused by a Rasa Core event not being of the expected + type. + + """ + + class Checkpoint: def __init__( self, name: Optional[Text], conditions: Optional[Dict[Text, Any]] = None @@ -131,6 +139,19 @@ def _user_string(story_step_element: UserUttered, e2e: bool) -> Text: def _bot_string(story_step_element: Event) -> Text: return f" - {story_step_element.as_story_string()}\n" + @staticmethod + def _or_string(story_step_element: List[Event], e2e: bool) -> Text: + for event in story_step_element: + if not isinstance(event, UserUttered): + raise EventTypeError( + "OR statement events must be of type `UserUttered`." + ) + + result = " OR ".join( + [element.as_story_string(e2e) for element in story_step_element] + ) + return f"* {result}\n" + def as_story_string(self, flat: bool = False, e2e: bool = False) -> Text: # if the result should be flattened, we # will exclude the caption and any checkpoints. @@ -156,6 +177,11 @@ def as_story_string(self, flat: bool = False, e2e: bool = False) -> Text: converted = s.as_story_string() # pytype: disable=attribute-error if converted: result += self._bot_string(s) + elif isinstance(s, list): + # The story reader classes support reading stories in + # conversion mode. When this mode is enabled, OR statements + # are represented as lists of events. + result += self._or_string(s, e2e) else: raise Exception(f"Unexpected element in story step: {s}") diff --git a/tests/shared/core/training_data/story_reader/test_yaml_story_reader.py b/tests/shared/core/training_data/story_reader/test_yaml_story_reader.py index 361ee90f34df..5106bf86b2ca 100644 --- a/tests/shared/core/training_data/story_reader/test_yaml_story_reader.py +++ b/tests/shared/core/training_data/story_reader/test_yaml_story_reader.py @@ -359,3 +359,35 @@ def test_read_mixed_training_data_file(default_domain: Domain): with pytest.warns(None) as record: reader.read_from_parsed_yaml(yaml_content) assert not len(record) + + +def test_or_statement_if_not_training_mode(): + stories = """ + stories: + - story: hello world + steps: + - or: + - intent: intent1 + - intent: intent2 + - action: some_action + - intent: intent3 + - action: other_action + """ + + reader = YAMLStoryReader(is_used_for_training=False) + yaml_content = rasa.shared.utils.io.read_yaml(stories) + + steps = reader.read_from_parsed_yaml(yaml_content) + + assert len(steps) == 1 + + assert len(steps[0].events) == 4 # 4 events in total + assert len(steps[0].start_checkpoints) == 1 + assert steps[0].start_checkpoints[0].name == "STORY_START" + assert steps[0].end_checkpoints == [] + + or_statement = steps[0].events[0] + assert isinstance(or_statement, list) # But first one is a list (OR) + + assert or_statement[0].intent["name"] == "intent1" + assert or_statement[1].intent["name"] == "intent2" diff --git a/tests/shared/core/training_data/story_writer/test_yaml_story_writer.py b/tests/shared/core/training_data/story_writer/test_yaml_story_writer.py index cfcc5ea63a57..fa746263b082 100644 --- a/tests/shared/core/training_data/story_writer/test_yaml_story_writer.py +++ b/tests/shared/core/training_data/story_writer/test_yaml_story_writer.py @@ -38,7 +38,7 @@ async def test_simple_story( ): original_md_reader = MarkdownStoryReader( - default_domain, None, False, input_md_file, is_used_for_conversion=True + default_domain, None, False, input_md_file, is_used_for_training=False ) original_md_story_steps = original_md_reader.read_from_file(input_md_file) @@ -67,7 +67,7 @@ async def test_story_start_checkpoint_is_skipped(default_domain: Domain): input_md_file = "data/test_stories/stories.md" original_md_reader = MarkdownStoryReader( - default_domain, None, False, input_md_file, is_used_for_conversion=True + default_domain, None, False, input_md_file, is_used_for_training=False ) original_md_story_steps = original_md_reader.read_from_file(input_md_file) @@ -78,7 +78,7 @@ async def test_story_start_checkpoint_is_skipped(default_domain: Domain): async def test_forms_are_converted(default_domain: Domain): original_md_reader = MarkdownStoryReader( - default_domain, None, False, is_used_for_conversion=True + default_domain, None, False, is_used_for_training=False ) original_md_story_steps = original_md_reader.read_from_file( "data/test_stories/stories_form.md" diff --git a/tests/shared/core/training_data/test_structures.py b/tests/shared/core/training_data/test_structures.py index f252d4febf27..5794aac4eb9f 100644 --- a/tests/shared/core/training_data/test_structures.py +++ b/tests/shared/core/training_data/test_structures.py @@ -32,6 +32,41 @@ def test_session_start_is_not_serialised(default_domain: Domain): assert story.as_story_string(flat=True) == expected +def test_as_story_string_or_statement(): + from rasa.shared.core.training_data.story_reader.yaml_story_reader import ( + YAMLStoryReader, + ) + + import rasa.shared.utils.io + + stories = """ + stories: + - story: hello world + steps: + - or: + - intent: intent1 + - intent: intent2 + - intent: intent3 + - action: some_action + """ + + reader = YAMLStoryReader(is_used_for_training=False) + yaml_content = rasa.shared.utils.io.read_yaml(stories) + + steps = reader.read_from_parsed_yaml(yaml_content) + + assert len(steps) == 1 + + assert ( + steps[0].as_story_string() + == """ +## hello world +* intent1 OR intent2 OR intent3 + - some_action +""" + ) + + def test_cap_length(): assert ( rasa.shared.core.training_data.structures._cap_length("mystring", 6) == "mys..."