diff --git a/changelog/4612.bugfix.md b/changelog/4612.bugfix.md new file mode 100644 index 000000000000..b022209a77e3 --- /dev/null +++ b/changelog/4612.bugfix.md @@ -0,0 +1 @@ +Fixed a bug where OR statements in stories would break the check for whether a model needs to be retrained \ No newline at end of file diff --git a/changelog/7955.improvement.md b/changelog/7955.improvement.md new file mode 100644 index 000000000000..1e58237d0dcf --- /dev/null +++ b/changelog/7955.improvement.md @@ -0,0 +1 @@ +Drastically improved fingerprinting time for large story graphs \ No newline at end of file diff --git a/data/test_yaml_stories/story_with_two_equal_or_statements.yml b/data/test_yaml_stories/story_with_two_equal_or_statements.yml new file mode 100644 index 000000000000..622f35c4e7ff --- /dev/null +++ b/data/test_yaml_stories/story_with_two_equal_or_statements.yml @@ -0,0 +1,25 @@ +stories: +- story: story_with_two_equal_or_statements + steps: + - intent: simple + - action: utter_default + - or: + - intent: affirm + - intent: thank_you + - action: utter_greet + - action: utter_default + - or: + - intent: affirm + - intent: thank_you + - action: utter_goodbye +# same name again, same events, same intents, but with entities +- story: story_with_two_equal_or_statements + steps: + - intent: simple + - action: utter_default + - or: + - intent: affirm + entities: + - name: peter + - intent: thank_you + - action: utter_greet diff --git a/rasa/shared/core/training_data/loading.py b/rasa/shared/core/training_data/loading.py index f788be23ec76..38483c0f5063 100644 --- a/rasa/shared/core/training_data/loading.py +++ b/rasa/shared/core/training_data/loading.py @@ -54,7 +54,7 @@ def _guess_reader( async def load_data_from_resource( - resource: Union[Text, Path], + resource: Union[Text], domain: Domain, template_variables: Optional[Dict] = None, use_e2e: bool = False, diff --git a/rasa/shared/core/training_data/story_reader/story_step_builder.py 
b/rasa/shared/core/training_data/story_reader/story_step_builder.py index fcb9249f4416..631237d39d64 100644 --- a/rasa/shared/core/training_data/story_reader/story_step_builder.py +++ b/rasa/shared/core/training_data/story_reader/story_step_builder.py @@ -91,9 +91,8 @@ def add_user_messages( # user can use the express the same thing # we need to copy the blocks and create one # copy for each possible message - prefix = GENERATED_CHECKPOINT_PREFIX + "OR_" - generated_checkpoint = rasa.shared.core.training_data.structures.generate_id( - prefix, GENERATED_HASH_LENGTH + generated_checkpoint = self._generate_checkpoint_name_for_or_statement( + messages ) updated_steps = [] for t in self.current_steps: @@ -143,3 +142,35 @@ def _next_story_steps(self) -> List[StoryStep]: ) ] return current_turns + + def _generate_checkpoint_name_for_or_statement( + self, messages: List[UserUttered] + ) -> str: + """Generates a unique checkpoint name for an or statement. + + The name is based on the current story/rule name, + the current place in the story since the last checkpoint or start, + the name of the starting checkpoints, + and the involved intents/e2e messages. 
+ """ + messages_texts_or_intents = sorted([str(m) for m in messages]) + start_checkpoint_names = sorted( + list({chk.name for s in self.current_steps for chk in s.start_checkpoints}) + ) + events = [str(e) for s in self.current_steps for e in s.events] + # name: to identify the current story or rule + # events: to identify what has happened so far + # within the current story/rule + # start checkpoint names: to identify the section + # within the current story/rule when there are + # multiple internal checkpoints + # messages texts or intents: identifying the members of the or statement + unique_id = ( + f"{self.name}_<>_{'@@@'.join(events)}" + f"_<>_{'@@@'.join(start_checkpoint_names)}" + f"_<>_{'@@@'.join(messages_texts_or_intents)}" + ) + hashed_id = rasa.shared.utils.io.get_text_hash(unique_id)[ + :GENERATED_HASH_LENGTH + ] + return f"{GENERATED_CHECKPOINT_PREFIX}OR_{hashed_id}" diff --git a/rasa/shared/core/training_data/structures.py b/rasa/shared/core/training_data/structures.py index 8ad0d7b62af1..4cafdf61fc53 100644 --- a/rasa/shared/core/training_data/structures.py +++ b/rasa/shared/core/training_data/structures.py @@ -411,8 +411,8 @@ def fingerprint(self) -> Text: YAMLStoryWriter, ) - self_as_string = YAMLStoryWriter().dumps(self.story_steps) - return rasa.shared.utils.io.get_text_hash(self_as_string) + stories_as_yaml = YAMLStoryWriter().stories_to_yaml(self.story_steps) + return rasa.shared.utils.io.deep_container_fingerprint(stories_as_yaml) def ordered_steps(self) -> List[StoryStep]: """Returns the story steps ordered by topological order of the DAG.""" diff --git a/tests/shared/core/training_data/test_graph.py b/tests/shared/core/training_data/test_graph.py index 3086e6fb5fd1..1d6430f8ed22 100644 --- a/tests/shared/core/training_data/test_graph.py +++ b/tests/shared/core/training_data/test_graph.py @@ -1,4 +1,6 @@ from rasa.shared.core.training_data.structures import StoryGraph +import rasa.shared.core.training_data.loading +from 
rasa.shared.core.domain import Domain def check_graph_is_sorted(g, sorted_nodes, removed_edges): @@ -45,3 +47,43 @@ def test_node_ordering_with_cycle(): def test_is_empty(): assert StoryGraph([]).is_empty() + + +async def test_consistent_fingerprints(): + stories_path = "data/test_yaml_stories/stories.yml" + domain_path = "data/test_domains/default_with_slots.yml" + domain = Domain.load(domain_path) + story_steps = await rasa.shared.core.training_data.loading.load_data_from_resource( + stories_path, domain + ) + story_graph = StoryGraph(story_steps) + + # read again + story_steps_2 = await rasa.shared.core.training_data.loading.load_data_from_resource( + stories_path, domain + ) + story_graph_2 = StoryGraph(story_steps_2) + + fingerprint = story_graph.fingerprint() + fingerprint_2 = story_graph_2.fingerprint() + + assert fingerprint == fingerprint_2 + + +async def test_unique_checkpoint_names(): + stories_path = "data/test_yaml_stories/story_with_two_equal_or_statements.yml" + domain_path = "data/test_domains/default_with_slots.yml" + domain = Domain.load(domain_path) + story_steps = await rasa.shared.core.training_data.loading.load_data_from_resource( + stories_path, domain + ) + start_checkpoint_names = { + chk.name for s in story_steps for chk in s.start_checkpoints + } + + # first story: + # START_CHECKPOINT, GENR_OR_XXXXX for first OR, GENR_OR_YYYYY for second OR + + # additional in second story: + # GENR_OR_ZZZZZ as entities are different from first OR in first story + assert len(start_checkpoint_names) == 4