Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed changing fingerprints even though training data did not change #7246

Merged
merged 7 commits into from
Nov 13, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog/7246.bugfix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fixed model fingerprinting - it should avoid some more unnecessary retrainings now.
16 changes: 7 additions & 9 deletions rasa/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,18 +269,16 @@ def get_file_hash(path: Text) -> Text:
return md5(file_as_bytes(path)).hexdigest()


def get_dict_hash(
    data: Dict, encoding: Text = rasa.shared.utils.io.DEFAULT_ENCODING
) -> Text:
    """Calculate the md5 hash of a dictionary.

    The dictionary is serialized as JSON with sorted keys first, so the hash
    is independent of the dictionary's key insertion order.

    Args:
        data: dictionary to hash (must be JSON-serializable).
        encoding: encoding used to turn the serialized form into bytes.

    Returns:
        Hex digest of the md5 hash.
    """
    serialized = json.dumps(data, sort_keys=True)
    return md5(serialized.encode(encoding)).hexdigest()


async def download_file_from_url(url: Text) -> Text:
"""Download a story file from a url and persists it into a temp file.

Returns the file path of the temp file that contains the
downloaded content."""
Args:
url: url to download from

Returns:
The file path of the temp file that contains the
downloaded content.
"""
from rasa.nlu import utils as nlu_utils

if not nlu_utils.is_url(url):
Expand Down
25 changes: 13 additions & 12 deletions rasa/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import tempfile
import typing
from pathlib import Path
from typing import Text, Tuple, Union, Optional, List, Dict, NamedTuple
from typing import Any, Text, Tuple, Union, Optional, List, Dict, NamedTuple

import rasa.shared.utils.io
import rasa.utils.io
Expand All @@ -24,7 +24,6 @@
DEFAULT_NLU_SUBDIRECTORY_NAME,
)

from rasa.core.utils import get_dict_hash
from rasa.exceptions import ModelNotFound
from rasa.utils.common import TempDirectoryPath

Expand Down Expand Up @@ -319,25 +318,27 @@ async def model_fingerprint(file_importer: "TrainingDataImporter") -> Fingerprin
domain.templates = []

return {
FINGERPRINT_CONFIG_KEY: _get_hash_of_config(config, exclude_keys=CONFIG_KEYS),
FINGERPRINT_CONFIG_CORE_KEY: _get_hash_of_config(
FINGERPRINT_CONFIG_KEY: _get_fingerprint_of_config(
config, exclude_keys=CONFIG_KEYS
),
FINGERPRINT_CONFIG_CORE_KEY: _get_fingerprint_of_config(
config, include_keys=CONFIG_KEYS_CORE
),
FINGERPRINT_CONFIG_NLU_KEY: _get_hash_of_config(
FINGERPRINT_CONFIG_NLU_KEY: _get_fingerprint_of_config(
config, include_keys=CONFIG_KEYS_NLU
),
FINGERPRINT_DOMAIN_WITHOUT_NLG_KEY: hash(domain),
FINGERPRINT_NLG_KEY: get_dict_hash(responses),
FINGERPRINT_DOMAIN_WITHOUT_NLG_KEY: domain.fingerprint(),
FINGERPRINT_NLG_KEY: rasa.shared.utils.io.deep_container_fingerprint(responses),
FINGERPRINT_PROJECT: project_fingerprint(),
FINGERPRINT_NLU_DATA_KEY: hash(nlu_data),
FINGERPRINT_STORIES_KEY: hash(stories),
FINGERPRINT_NLU_DATA_KEY: nlu_data.fingerprint(),
FINGERPRINT_STORIES_KEY: stories.fingerprint(),
FINGERPRINT_TRAINED_AT_KEY: time.time(),
FINGERPRINT_RASA_VERSION_KEY: rasa.__version__,
}


def _get_hash_of_config(
config: Optional[Dict],
def _get_fingerprint_of_config(
config: Optional[Dict[Text, Any]],
include_keys: Optional[List[Text]] = None,
exclude_keys: Optional[List[Text]] = None,
) -> Text:
Expand All @@ -348,7 +349,7 @@ def _get_hash_of_config(

sub_config = {k: config[k] for k in keys if k in config}

return get_dict_hash(sub_config)
return rasa.shared.utils.io.deep_container_fingerprint(sub_config)


def fingerprint_from_path(model_path: Text) -> Fingerprint:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -403,7 +403,7 @@ def cache_key(
Returns: key of the cache for future retrievals.
"""
_config = common.update_existing_keys(cls.defaults, component_meta)
return f"{cls.name}-{rasa.core.utils.get_dict_hash(_config)}"
return f"{cls.name}-{rasa.shared.utils.io.deep_container_fingerprint(_config)}"

def provide_context(self) -> Dict[Text, Any]:
"""Store the model in pipeline context for future use."""
Expand Down
3 changes: 2 additions & 1 deletion rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from rasa.nlu.components import Component, UnsupportedLanguageError
from rasa.nlu.featurizers.featurizer import DenseFeaturizer
from rasa.nlu.model import Metadata
import rasa.shared.utils.io
from rasa.shared.nlu.training_data.features import Features
from rasa.nlu.tokenizers.tokenizer import Tokenizer, Token
from rasa.shared.nlu.training_data.training_data import TrainingData
Expand Down Expand Up @@ -222,7 +223,7 @@ def cache_key(

return (
f"{cls.name}-{component_meta.get('model_name')}-"
f"{rasa.core.utils.get_dict_hash(weights)}"
f"{rasa.shared.utils.io.deep_container_fingerprint(weights)}"
)

@classmethod
Expand Down
3 changes: 2 additions & 1 deletion rasa/nlu/utils/hugging_face/hf_transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from rasa.nlu.config import RasaNLUModelConfig
from rasa.shared.nlu.training_data.training_data import TrainingData
from rasa.shared.nlu.training_data.message import Message
import rasa.shared.utils.io
from rasa.nlu.tokenizers.tokenizer import Token
import rasa.shared.utils.io
import rasa.utils.train_utils as train_utils
Expand Down Expand Up @@ -148,7 +149,7 @@ def cache_key(

return (
f"{cls.name}-{component_meta.get('model_name')}-"
f"{rasa.core.utils.get_dict_hash(weights)}"
f"{rasa.shared.utils.io.deep_container_fingerprint(weights)}"
)

@classmethod
Expand Down
13 changes: 9 additions & 4 deletions rasa/shared/core/domain.py
Original file line number Diff line number Diff line change
Expand Up @@ -627,17 +627,22 @@ def _initialize_forms(

def __hash__(self) -> int:
    """Returns a unique hash for the domain."""
    domain_fingerprint = self.fingerprint()
    return int(domain_fingerprint, base=16)

def fingerprint(self) -> Text:
    """Returns a unique hash for the domain which is stable across python runs.

    Returns:
        fingerprint of the domain
    """
    serializable_domain = self.as_dict()
    # Normalize parts whose ordering may vary between runs so the fingerprint
    # only changes when the domain content actually changes.
    sorted_intents = rasa.shared.utils.common.sort_list_of_dicts_by_first_key(
        serializable_domain[KEY_INTENTS]
    )
    serializable_domain[KEY_INTENTS] = sorted_intents
    serializable_domain[KEY_ACTIONS] = self.action_names
    return rasa.shared.utils.io.get_dictionary_fingerprint(serializable_domain)

@rasa.shared.utils.common.lazy_property
def user_actions_and_forms(self):
Expand Down
18 changes: 14 additions & 4 deletions rasa/shared/core/training_data/structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -394,14 +394,24 @@ def __init__(
self.story_end_checkpoints = {}

def __hash__(self) -> int:
    """Return hash for the story step.

    Returns:
        Hash of the story step.
    """
    hex_fingerprint = self.fingerprint()
    return int(hex_fingerprint, base=16)
def fingerprint(self) -> Text:
    """Returns a unique hash for the stories which is stable across python runs.

    Returns:
        fingerprint of the stories
    """
    serialized_stories = self.as_story_string()
    return rasa.shared.utils.io.get_text_hash(serialized_stories)

def ordered_steps(self) -> List[StoryStep]:
    """Returns the story steps ordered by topological order of the DAG."""
    return list(map(self.get, self.ordered_ids))

def cyclic_edges(self) -> List[Tuple[Optional[StoryStep], Optional[StoryStep]]]:
Expand Down
27 changes: 16 additions & 11 deletions rasa/shared/nlu/training_data/message.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
from typing import Any, Optional, Tuple, Text, Dict, Set, List

import typing
Expand Down Expand Up @@ -86,10 +87,23 @@ def __eq__(self, other) -> bool:
if not isinstance(other, Message):
return False
else:
return ordered(other.data) == ordered(self.data)
return other.fingerprint() == self.fingerprint()

def __hash__(self) -> int:
    """Calculate a hash for the message.

    Returns:
        Hash of the message.
    """
    hex_fingerprint = self.fingerprint()
    return int(hex_fingerprint, base=16)

def fingerprint(self) -> Text:
    """Calculate a string fingerprint for the message.

    Returns:
        Fingerprint of the message.
    """
    message_data = self.data
    return rasa.shared.utils.io.deep_container_fingerprint(message_data)

@classmethod
def build(
Expand Down Expand Up @@ -322,12 +336,3 @@ def is_core_message(self) -> bool:
and not (self.data.get(INTENT) or self.data.get(RESPONSE))
)
)


def ordered(obj: Any) -> Any:
    """Recursively convert ``obj`` into a canonically ordered representation.

    Dictionaries become sorted lists of ``(key, value)`` tuples and lists are
    sorted, so two containers with the same content in a different order
    produce equal results. Used for order-independent equality and hashing.

    Args:
        obj: arbitrary (possibly nested) container or scalar.

    Returns:
        A canonically ordered version of ``obj``.
    """
    if isinstance(obj, dict):
        return _sorted_safely((k, ordered(v)) for k, v in obj.items())
    if isinstance(obj, list):
        return _sorted_safely(ordered(x) for x in obj)
    return obj


def _sorted_safely(elements: Any) -> List[Any]:
    """Sort ``elements``, tolerating mixed element types.

    Python 3 raises ``TypeError`` when comparing unrelated types (e.g. ``int``
    vs ``str``), which previously made `ordered` crash on heterogeneous lists.
    In that case fall back to a deterministic sort by type name and repr.
    """
    items = list(elements)
    try:
        return sorted(items)
    except TypeError:
        return sorted(items, key=lambda e: (type(e).__name__, repr(e)))
41 changes: 35 additions & 6 deletions rasa/shared/nlu/training_data/training_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,17 +61,44 @@ def __init__(

self._fill_response_phrases()

def merge(self, *others: "TrainingData") -> "TrainingData":
"""Return merged instance of this data with other training data."""
def fingerprint(self) -> Text:
    """Fingerprint the training data.

    Returns:
        hex string as a fingerprint of the training data.
    """
    # Sorting the per-example fingerprints makes the overall fingerprint
    # independent of training example order; `sorted` already returns a
    # list, so the former `list(sorted(...))` wrapper was redundant.
    relevant_attributes = {
        "training_examples": sorted(
            example.fingerprint() for example in self.training_examples
        ),
        "entity_synonyms": self.entity_synonyms,
        "regex_features": self.regex_features,
        "lookup_tables": self.lookup_tables,
        "responses": self.responses,
    }
    return rasa.shared.utils.io.deep_container_fingerprint(relevant_attributes)

def merge(self, *others: Optional["TrainingData"]) -> "TrainingData":
"""Return merged instance of this data with other training data.

Args:
others: other training data instances to merge this one with

Returns:
Merged training data object. Merging is not done in place, this
will be a new instance.
"""
training_examples = copy.deepcopy(self.training_examples)
entity_synonyms = self.entity_synonyms.copy()
regex_features = copy.deepcopy(self.regex_features)
lookup_tables = copy.deepcopy(self.lookup_tables)
responses = copy.deepcopy(self.responses)
others = [other for other in others if other]

for o in others:
if not o:
continue

training_examples.extend(copy.deepcopy(o.training_examples))
regex_features.extend(copy.deepcopy(o.regex_features))
lookup_tables.extend(copy.deepcopy(o.lookup_tables))
Expand Down Expand Up @@ -109,10 +136,12 @@ def filter_training_examples(
)

def __hash__(self) -> int:
    """Calculate hash for the training data object.

    Returns:
        Hash of the training data object.
    """
    hex_fingerprint = self.fingerprint()
    return int(hex_fingerprint, base=16)

@staticmethod
def sanitize_examples(examples: List[Message]) -> List[Message]:
Expand Down
69 changes: 69 additions & 0 deletions rasa/shared/utils/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,75 @@ def list_subdirectories(path: Text) -> List[Text]:
return [fn for fn in glob.glob(os.path.join(path, "*")) if os.path.isdir(fn)]


def deep_container_fingerprint(
    obj: Union[List[Any], Dict[Any, Any]], encoding: Text = DEFAULT_ENCODING
) -> Text:
    """Calculate a hash which is stable, independent of a dict's key order.

    Works for lists and dictionaries. Dictionary keys and values are
    fingerprinted recursively (see `get_dictionary_fingerprint`); anything
    that is neither a dict nor a list is hashed via its string representation.
    Note that list ELEMENT ORDER does affect the fingerprint — only dictionary
    key order is normalized (see `get_list_fingerprint`).

    Args:
        obj: dictionary or list to be hashed.
        encoding: encoding used for dumping objects as strings

    Returns:
        hash of the container.
    """
    if isinstance(obj, dict):
        return get_dictionary_fingerprint(obj, encoding)
    if isinstance(obj, list):
        return get_list_fingerprint(obj, encoding)
    else:
        # scalars (str, int, float, ...) are hashed via their str() form
        return get_text_hash(str(obj), encoding)


def get_dictionary_fingerprint(
    dictionary: Dict[Any, Any], encoding: Text = DEFAULT_ENCODING
) -> Text:
    """Calculate the fingerprint for a dictionary.

    The dictionary can contain any keys and values which are either a dict,
    a list or elements which can be dumped as a string.

    Args:
        dictionary: dictionary to be hashed
        encoding: encoding used for dumping objects as strings

    Returns:
        The hash of the dictionary
    """
    # Fingerprint every key and value first, then dump with sorted keys so
    # the result does not depend on the dictionary's insertion order.
    fingerprinted = {}
    for key, value in dictionary.items():
        key_fingerprint = deep_container_fingerprint(key, encoding)
        fingerprinted[key_fingerprint] = deep_container_fingerprint(value, encoding)
    stringified = json.dumps(fingerprinted, sort_keys=True)
    return get_text_hash(stringified, encoding)


def get_list_fingerprint(
    elements: List[Any], encoding: Text = DEFAULT_ENCODING
) -> Text:
    """Calculate a fingerprint for a list.

    Note: the fingerprint IS sensitive to element order — the element
    fingerprints are serialized in list order without sorting. Callers that
    want an order-independent result must sort the list themselves first.

    Args:
        elements: list to fingerprint
        encoding: encoding used for dumping objects as strings

    Returns:
        the fingerprint of the list
    """
    stringified = json.dumps(
        [deep_container_fingerprint(element, encoding) for element in elements]
    )
    return get_text_hash(stringified, encoding)


def get_text_hash(text: Text, encoding: Text = DEFAULT_ENCODING) -> Text:
    """Calculate the md5 hash for a text.

    Args:
        text: text to hash.
        encoding: encoding used to convert the text to bytes.

    Returns:
        Hex digest of the md5 hash.
    """
    encoded = text.encode(encoding)
    return md5(encoded).hexdigest()
Expand Down
9 changes: 0 additions & 9 deletions tests/shared/nlu/training_data/test_message.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,15 +251,6 @@ def test_features_present(
assert actual == expected


def test_ordered():
    """`ordered` canonicalizes nested containers: dicts become sorted
    (key, value) tuple lists and inner lists are sorted."""
    target = {"a": [1, 3, 2], "c": "a", "b": 1}
    expected = [("a", [1, 2, 3]), ("b", 1), ("c", "a")]
    actual = rasa.shared.nlu.training_data.message.ordered(target)
    assert actual == expected


@pytest.mark.parametrize(
"message, core_message",
[
Expand Down
Loading