Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Consistent and faster stories fingerprinting #8041

Merged
merged 12 commits into from
Mar 3, 2021
1 change: 1 addition & 0 deletions changelog/4612.bugfix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fixed a bug where OR statements in stories would break the check for whether a model needs to be retrained.
1 change: 1 addition & 0 deletions changelog/7955.improvement.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Drastically improved fingerprinting time for large story graphs.
25 changes: 25 additions & 0 deletions data/test_yaml_stories/story_with_two_equal_or_statements.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Test fixture: two stories that share the same name and each contain
# `or` statements.
stories:
- story: story_with_two_equal_or_statements
steps:
- intent: simple
- action: utter_default
# First `or` statement of the first story.
- or:
- intent: affirm
- intent: thank_you
- action: utter_greet
- action: utter_default
# Second `or` statement of the first story; same intents as the first one,
# but reached after more events.
- or:
- intent: affirm
- intent: thank_you
- action: utter_goodbye
# Same story name again, same events and same intents as the first `or`
# statement above, but this time one of the intents carries entities.
- story: story_with_two_equal_or_statements
steps:
- intent: simple
- action: utter_default
- or:
- intent: affirm
entities:
- name: peter
- intent: thank_you
- action: utter_greet
2 changes: 1 addition & 1 deletion rasa/shared/core/training_data/loading.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def _guess_reader(


async def load_data_from_resource(
resource: Union[Text, Path],
resource: Union[Text],
domain: Domain,
template_variables: Optional[Dict] = None,
use_e2e: bool = False,
Expand Down
37 changes: 34 additions & 3 deletions rasa/shared/core/training_data/story_reader/story_step_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,8 @@ def add_user_messages(
# user can use to express the same thing
# we need to copy the blocks and create one
# copy for each possible message
prefix = GENERATED_CHECKPOINT_PREFIX + "OR_"
generated_checkpoint = rasa.shared.core.training_data.structures.generate_id(
prefix, GENERATED_HASH_LENGTH
generated_checkpoint = self._generate_checkpoint_name_for_or_statement(
messages
)
updated_steps = []
for t in self.current_steps:
Expand Down Expand Up @@ -143,3 +142,35 @@ def _next_story_steps(self) -> List[StoryStep]:
)
]
return current_turns

def _generate_checkpoint_name_for_or_statement(
    self, messages: List[UserUttered]
) -> str:
    """Builds the checkpoint name used for an `or` statement.

    The name is derived from a text hash over four ingredients:

    - the name of the current story/rule (`self.name`),
    - the events of the current steps, i.e. what has happened so far
      within the current story/rule,
    - the names of the start checkpoints of the current steps, which tell
      apart sections when there are multiple internal checkpoints,
    - the string forms of the messages that are members of the `or`
      statement.

    Args:
        messages: The user messages that are alternatives in the `or`
            statement.

    Returns:
        `GENERATED_CHECKPOINT_PREFIX` followed by `OR_` and the first
        `GENERATED_HASH_LENGTH` characters of the hash.
    """
    # Sorted so the order in which the alternatives are listed is irrelevant.
    or_members = sorted(str(message) for message in messages)

    # Collected into a set first to drop duplicate names, then sorted.
    checkpoint_names = {
        checkpoint.name
        for step in self.current_steps
        for checkpoint in step.start_checkpoints
    }
    ordered_checkpoint_names = sorted(checkpoint_names)

    # Events are kept in their original order: they describe the history.
    history = []
    for step in self.current_steps:
        for event in step.events:
            history.append(str(event))

    separator = "@@@"
    parts = (
        f"{self.name}",
        separator.join(history),
        separator.join(ordered_checkpoint_names),
        separator.join(or_members),
    )
    unique_id = "_<>_".join(parts)

    full_hash = rasa.shared.utils.io.get_text_hash(unique_id)
    return f"{GENERATED_CHECKPOINT_PREFIX}OR_{full_hash[:GENERATED_HASH_LENGTH]}"
4 changes: 2 additions & 2 deletions rasa/shared/core/training_data/structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -411,8 +411,8 @@ def fingerprint(self) -> Text:
YAMLStoryWriter,
)

self_as_string = YAMLStoryWriter().dumps(self.story_steps)
return rasa.shared.utils.io.get_text_hash(self_as_string)
stories_as_yaml = YAMLStoryWriter().stories_to_yaml(self.story_steps)
return rasa.shared.utils.io.deep_container_fingerprint(stories_as_yaml)

def ordered_steps(self) -> List[StoryStep]:
"""Returns the story steps ordered by topological order of the DAG."""
Expand Down
42 changes: 42 additions & 0 deletions tests/shared/core/training_data/test_graph.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from rasa.shared.core.training_data.structures import StoryGraph
import rasa.shared.core.training_data.loading
from rasa.shared.core.domain import Domain


def check_graph_is_sorted(g, sorted_nodes, removed_edges):
Expand Down Expand Up @@ -45,3 +47,43 @@ def test_node_ordering_with_cycle():

def test_is_empty():
    """A story graph built from an empty list of steps reports being empty."""
    empty_graph = StoryGraph([])
    assert empty_graph.is_empty()


async def test_consistent_fingerprints():
    """Reading the same stories twice must give the same graph fingerprint."""
    stories_path = "data/test_yaml_stories/stories.yml"
    domain_path = "data/test_domains/default_with_slots.yml"
    domain = Domain.load(domain_path)

    # Load the stories from disk twice and build an independent graph from
    # each read; fingerprints are only computed after both graphs exist.
    story_graphs = []
    for _ in range(2):
        story_steps = await rasa.shared.core.training_data.loading.load_data_from_resource(
            stories_path, domain
        )
        story_graphs.append(StoryGraph(story_steps))

    fingerprint, fingerprint_2 = [graph.fingerprint() for graph in story_graphs]

    assert fingerprint == fingerprint_2


async def test_unique_checkpoint_names():
    """Each distinct `or` statement in the fixture gets its own checkpoint name."""
    domain = Domain.load("data/test_domains/default_with_slots.yml")
    story_steps = await rasa.shared.core.training_data.loading.load_data_from_resource(
        "data/test_yaml_stories/story_with_two_equal_or_statements.yml", domain
    )

    start_checkpoint_names = set()
    for step in story_steps:
        start_checkpoint_names.update(chk.name for chk in step.start_checkpoints)

    # Expected names:
    #   first story:  START_CHECKPOINT,
    #                 GENR_OR_XXXXX for the first OR,
    #                 GENR_OR_YYYYY for the second OR
    #   second story: additionally GENR_OR_ZZZZZ, because its entities differ
    #                 from the first OR in the first story
    assert len(start_checkpoint_names) == 4