From 136628df63e98733b41cbfb782a12fe7ff9a35aa Mon Sep 17 00:00:00 2001 From: Daksh Date: Mon, 30 Nov 2020 20:21:33 +0100 Subject: [PATCH 01/45] doc strings and changes needed to cvf --- .../count_vectors_featurizer.py | 272 ++++++++++++++---- 1 file changed, 218 insertions(+), 54 deletions(-) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index 29978e3018a9..d4daec8669bf 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -2,7 +2,7 @@ import os import re import scipy.sparse -from typing import Any, Dict, List, Optional, Text, Type, Tuple +from typing import Any, Dict, List, Optional, Text, Type, Tuple, Set import rasa.shared.utils.io from rasa.shared.constants import DOCS_URL_COMPONENTS @@ -26,11 +26,15 @@ TEXT, INTENT, INTENT_RESPONSE_KEY, + RESPONSE, + ACTION_TEXT, FEATURE_TYPE_SENTENCE, FEATURE_TYPE_SEQUENCE, ACTION_NAME, ) +BUFFER_SLOTS_PREFIX = "buf_" + logger = logging.getLogger(__name__) @@ -84,6 +88,8 @@ def required_components(cls) -> List[Type[Component]]: # indicates whether the featurizer should use the lemma of a word for # counting (if available) or not "use_lemma": True, + # Additional vocabulary size to be kept reserved + "additional_vocabulary_size": {TEXT: None, RESPONSE: None, ACTION_TEXT: None}, } @classmethod @@ -125,7 +131,7 @@ def _load_count_vect_params(self) -> None: self.use_lemma = self.component_config["use_lemma"] # noinspection PyPep8Naming - def _load_OOV_params(self) -> None: + def _load_vocabulary_params(self) -> None: self.OOV_token = self.component_config["OOV_token"] self.OOV_words = self.component_config["OOV_words"] @@ -143,6 +149,11 @@ def _load_OOV_params(self) -> None: if self.OOV_words: self.OOV_words = [w.lower() for w in self.OOV_words] + # Additional vocabulary size to be kept reserved + self.additional_vocabulary_size = self.component_config[ + "additional_vocabulary_size" + ] + def _check_attribute_vocabulary(self, attribute: Text) -> bool: """Check if trained vocabulary exists in attribute's count vectorizer""" try: @@ -200,6 +211,7 @@ def __init__( self, component_config: Optional[Dict[Text, Any]] = None, vectorizers: Optional[Dict[Text, "CountVectorizer"]] = None, + finetune_mode: bool = False, ) -> None: """Construct a new count vectorizer using the sklearn framework.""" @@ -209,7 +221,7 @@ def __init__( self._load_count_vect_params() # handling Out-Of-Vocabulary (OOV) words - self._load_OOV_params() + self._load_vocabulary_params() # warn that some of config parameters might be ignored self._check_analyzer() @@ -220,6 +232,8 @@ def __init__( # declare class instance for CountVectorizer self.vectorizers = vectorizers + self.finetune_mode = finetune_mode + def _get_message_tokens_by_attribute( self, message: "Message", attribute: Text ) -> List[Text]: @@ -341,71 +355,219 @@ def _convert_attribute_tokens_to_texts( return attribute_texts - def _train_with_shared_vocab(self, attribute_texts: Dict[Text, List[Text]]): - """Construct the vectorizers and train them with a shared vocab""" + @staticmethod + def _get_starting_empty_index(vocabulary: Dict[Text, int]) -> int: + for key in vocabulary.keys(): + if key.startswith(BUFFER_SLOTS_PREFIX): + return int(key.split(BUFFER_SLOTS_PREFIX)[1]) + return len(vocabulary) + + def _update_vectorizer_vocabulary( + self, attribute: Text, vocabulary: Set[Text] + ) -> None: + """Update the existing 
vocabulary of the vectorizer with new unseen words. - self.vectorizers = self._create_shared_vocab_vectorizers( - { - "strip_accents": self.strip_accents, - "lowercase": self.lowercase, - "stop_words": self.stop_words, - "min_ngram": self.min_ngram, - "max_ngram": self.max_ngram, - "max_df": self.max_df, - "min_df": self.min_df, - "max_features": self.max_features, - "analyzer": self.analyzer, - } - ) + These unseen words should only occupy the empty buffer slots. - combined_cleaned_texts = [] - for attribute in self._attributes: - combined_cleaned_texts += attribute_texts[attribute] + Args: + attribute: Message attribute for which vocabulary should be updated. + vocabulary: Set of words to expand the vocabulary with if they are unseen. - try: - self.vectorizers[TEXT].fit(combined_cleaned_texts) - except ValueError: + """ + existing_vocabulary: Dict[Text, int] = self.vectorizers[attribute].vocabulary + available_empty_index = self._get_starting_empty_index(existing_vocabulary) + # print( + # attribute, len(existing_vocabulary), available_empty_index, len(vocabulary) + # ) + if len(vocabulary) > len(existing_vocabulary): logger.warning( - "Unable to train a shared CountVectorizer. " - "Leaving an untrained CountVectorizer" + f"New data contains vocabulary of size {len(vocabulary)} for attribute {attribute} " + f"which is larger than the maximum vocabulary size({len(existing_vocabulary)}) " + f"of the original model. Some tokens will have to be dropped " + f"in order to continue training. It is advised to re-train the " + f"model from scratch on complete data." ) + for token in vocabulary: + if token not in existing_vocabulary: + existing_vocabulary[token] = available_empty_index + del existing_vocabulary[f"{BUFFER_SLOTS_PREFIX}{available_empty_index}"] + available_empty_index += 1 + if available_empty_index == len(existing_vocabulary): + break + self._set_vocabulary(attribute, existing_vocabulary) + + def _add_buffer_to_vocabulary(self, attribute: Text) -> None: + """Add extra tokens to vocabulary for incremental training. + + These extra tokens act as buffer slots which are used up sequentially + when more data is received as part of incremental training. Each of + these tokens start with a prefix `buf_` followed by the extra slot index. + So for example - buf_1, buf_2, buf_3... and so on. + + Args: + attribute: Name of the attribute for which the vocabulary should be expanded. + + """ + original_vocabulary = self.vectorizers[attribute].vocabulary_ + current_vocabulary_size = self._get_starting_empty_index(original_vocabulary) + for index in range( + current_vocabulary_size, + current_vocabulary_size + self.additional_vocabulary_size[attribute], + ): + original_vocabulary[f"{BUFFER_SLOTS_PREFIX}{index}"] = index + self._set_vocabulary(attribute, original_vocabulary) + + def _set_vocabulary( + self, attribute: Text, original_vocabulary: Dict[Text, int] + ) -> None: + """Set the vocabulary of the vectorizer of attribute + + Args: + attribute: Message attribute for which vocabulary should be set + original_vocabulary: Vocabulary for the attribute to be set. + + """ + self.vectorizers[attribute].vocabulary_ = original_vocabulary + self.vectorizers[attribute]._validate_vocabulary() + + @staticmethod + def _construct_vocabulary_from_texts( + vectorizer: CountVectorizer, texts: List[Text] + ) -> Set: + """Apply vectorizer's preprocessor on texts to get the vocabulary from texts. + + Args: + vectorizer: Sklearn's count vectorizer which has been pre-configured. 
+ texts: Examples from which the vocabulary should be constructed + + Returns: + Set of unique vocabulary words extracted. + """ + analyzer = vectorizer.build_analyzer() + vocabulary_words = set() + for example in texts: + example_vocabulary: List[Text] = analyzer(example) + vocabulary_words.update(example_vocabulary) + return vocabulary_words @staticmethod def _attribute_texts_is_non_empty(attribute_texts: List[Text]) -> bool: return any(attribute_texts) + def _train_with_shared_vocab(self, attribute_texts: Dict[Text, List[Text]]): + """Construct the vectorizers and train them with a shared vocab""" + combined_cleaned_texts = [] + for attribute in self._attributes: + combined_cleaned_texts += attribute_texts[attribute] + + if not self.finetune_mode: + self.vectorizers = self._create_shared_vocab_vectorizers( + { + "strip_accents": self.strip_accents, + "lowercase": self.lowercase, + "stop_words": self.stop_words, + "min_ngram": self.min_ngram, + "max_ngram": self.max_ngram, + "max_df": self.max_df, + "min_df": self.min_df, + "max_features": self.max_features, + "analyzer": self.analyzer, + } + ) + + self._fit_vectorizer_from_scratch(TEXT, combined_cleaned_texts) + + else: + self._fit_loaded_vectorizer(TEXT, combined_cleaned_texts) + def _train_with_independent_vocab(self, attribute_texts: Dict[Text, List[Text]]): """Construct the vectorizers and train them with an independent vocab""" - self.vectorizers = self._create_independent_vocab_vectorizers( - { - "strip_accents": self.strip_accents, - "lowercase": self.lowercase, - "stop_words": self.stop_words, - "min_ngram": self.min_ngram, - "max_ngram": self.max_ngram, - "max_df": self.max_df, - "min_df": self.min_df, - "max_features": self.max_features, - "analyzer": self.analyzer, - } - ) + if not self.finetune_mode: + self.vectorizers = self._create_independent_vocab_vectorizers( + { + "strip_accents": self.strip_accents, + "lowercase": self.lowercase, + "stop_words": self.stop_words, + "min_ngram": self.min_ngram, + "max_ngram": self.max_ngram, + "max_df": self.max_df, + "min_df": self.min_df, + "max_features": self.max_features, + "analyzer": self.analyzer, + } + ) - for attribute in self._attributes: - if self._attribute_texts_is_non_empty(attribute_texts[attribute]): - try: - self.vectorizers[attribute].fit(attribute_texts[attribute]) - except ValueError: - logger.warning( - f"Unable to train CountVectorizer for message " - f"attribute {attribute}. Leaving an untrained " - f"CountVectorizer for it." + for attribute in self._attributes: + if self._attribute_texts_is_non_empty(attribute_texts[attribute]): + self._fit_vectorizer_from_scratch( + attribute, attribute_texts[attribute] ) - else: - logger.debug( - f"No text provided for {attribute} attribute in any messages of " - f"training data. Skipping training a CountVectorizer for it." - ) + else: + logger.debug( + f"No text provided for {attribute} attribute in any messages of " + f"training data. Skipping training a CountVectorizer for it." + ) + else: + for attribute in self._attributes: + if self._attribute_texts_is_non_empty(attribute_texts[attribute]): + try: + self._fit_loaded_vectorizer( + attribute, attribute_texts[attribute] + ) + + except ValueError: + logger.warning( + f"Unable to train CountVectorizer for message " + f"attribute {attribute}. Leaving an untrained " + f"CountVectorizer for it." + ) + else: + logger.debug( + f"No text provided for {attribute} attribute in any messages of " + f"training data. Skipping training a CountVectorizer for it." 
+ ) + + def _fit_loaded_vectorizer( + self, attribute: Text, attribute_texts: List[Text] + ) -> None: + """Fit training texts to a previously trained count vectorizer. + + We do not use the `.fit()` method because the new unseen + words should occupy the buffer slots of the vocabulary. + + Args: + attribute: Message attribute for which the vectorizer is to be trained. + attribute_texts: Training texts for the attribute + + """ + # Get vocabulary words by the preprocessor + new_vocabulary = self._construct_vocabulary_from_texts( + self.vectorizers[attribute], attribute_texts + ) + # update the vocabulary of vectorizer with new vocabulary + self._update_vectorizer_vocabulary(attribute, new_vocabulary) + + def _fit_vectorizer_from_scratch( + self, attribute: Text, attribute_texts: List[Text] + ) -> None: + """Fit training texts to an untrained count vectorizer. + + Args: + attribute: Message attribute for which the vectorizer is to be trained. + attribute_texts: Training texts for the attribute + + """ + try: + self.vectorizers[attribute].fit(attribute_texts) + # Add buffer for extra vocabulary tokens in future + self._add_buffer_to_vocabulary(attribute) + except ValueError: + logger.warning( + f"Unable to train CountVectorizer for message " + f"attribute {attribute}. Leaving an untrained " + f"CountVectorizer for it." + ) def _create_features( self, attribute: Text, all_tokens: List[List[Text]] @@ -674,6 +836,8 @@ def load( **kwargs: Any, ) -> "CountVectorsFeaturizer": + finetune_mode = kwargs.pop("finetune_mode") + file_name = meta.get("file") featurizer_file = os.path.join(model_dir, file_name) @@ -693,7 +857,7 @@ def load( meta, vocabulary=vocabulary ) - ftr = cls(meta, vectorizers) + ftr = cls(meta, vectorizers, finetune_mode) # make sure the vocabulary has been loaded correctly for attribute in vectorizers: From b7091324f9215e94e289b681cbc8cf81b04686fc Mon Sep 17 00:00:00 2001 From: Daksh Date: Tue, 1 Dec 2020 23:09:31 +0100 Subject: [PATCH 02/45] added tests, small refactoring in cvf --- .../count_vectors_featurizer.py | 85 +++--- .../test_count_vectors_featurizer.py | 267 +++++++++++++++++- 2 files changed, 302 insertions(+), 50 deletions(-) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index d4daec8669bf..d0b675d0abcb 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -153,6 +153,14 @@ def _load_vocabulary_params(self) -> None: self.additional_vocabulary_size = self.component_config[ "additional_vocabulary_size" ] + if isinstance(self.additional_vocabulary_size, int): + # User defined a common value for all attributes + user_defined_additional_size = self.additional_vocabulary_size + self.additional_vocabulary_size = {} + for attribute in DENSE_FEATURIZABLE_ATTRIBUTES: + self.additional_vocabulary_size[ + attribute + ] = user_defined_additional_size def _check_attribute_vocabulary(self, attribute: Text) -> bool: """Check if trained vocabulary exists in attribute's count vectorizer""" @@ -376,9 +384,6 @@ def _update_vectorizer_vocabulary( """ existing_vocabulary: Dict[Text, int] = self.vectorizers[attribute].vocabulary available_empty_index = self._get_starting_empty_index(existing_vocabulary) - # print( - # attribute, len(existing_vocabulary), available_empty_index, len(vocabulary) - # ) if len(vocabulary) > len(existing_vocabulary): logger.warning( f"New data contains vocabulary 
of size {len(vocabulary)} for attribute {attribute} " @@ -396,6 +401,32 @@ def _update_vectorizer_vocabulary( break self._set_vocabulary(attribute, existing_vocabulary) + def _get_additional_vocabulary_size( + self, attribute: Text, existing_vocabulary_size: int + ) -> int: + """Get additional vocabulary size to be saved for incremental training. + + If `self.additional_vocabulary_size` is not None, + we return that as the user should have specified + this number. If not then we take the default + additional vocabulary size which is 1/2 of the + current vocabulary size. + Args: + attribute: Message attribute for which additional vocabulary size should be computed. + existing_vocabulary_size: Current size of vocabulary learnt from the training data. + + Returns: + Size of additional vocabulary that should be set aside for incremental training. + """ + + # Vocabulary expansion for INTENTS, ACTION_NAME + # and INTENT_RESPONSE_KEY is currently not supported. + if attribute not in DENSE_FEATURIZABLE_ATTRIBUTES: + return 0 + if self.additional_vocabulary_size.get(attribute, None) is not None: + return self.additional_vocabulary_size[attribute] + return int(existing_vocabulary_size * 0.5) + def _add_buffer_to_vocabulary(self, attribute: Text) -> None: """Add extra tokens to vocabulary for incremental training. @@ -412,7 +443,8 @@ def _add_buffer_to_vocabulary(self, attribute: Text) -> None: current_vocabulary_size = self._get_starting_empty_index(original_vocabulary) for index in range( current_vocabulary_size, - current_vocabulary_size + self.additional_vocabulary_size[attribute], + current_vocabulary_size + + self._get_additional_vocabulary_size(attribute, current_vocabulary_size), ): original_vocabulary[f"{BUFFER_SLOTS_PREFIX}{index}"] = index self._set_vocabulary(attribute, original_vocabulary) @@ -474,9 +506,7 @@ def _train_with_shared_vocab(self, attribute_texts: Dict[Text, List[Text]]): "analyzer": self.analyzer, } ) - self._fit_vectorizer_from_scratch(TEXT, combined_cleaned_texts) - else: self._fit_loaded_vectorizer(TEXT, combined_cleaned_texts) @@ -497,36 +527,19 @@ def _train_with_independent_vocab(self, attribute_texts: Dict[Text, List[Text]]) "analyzer": self.analyzer, } ) - - for attribute in self._attributes: - if self._attribute_texts_is_non_empty(attribute_texts[attribute]): + for attribute in self._attributes: + if self._attribute_texts_is_non_empty(attribute_texts[attribute]): + if not self.finetune_mode: self._fit_vectorizer_from_scratch( attribute, attribute_texts[attribute] ) else: - logger.debug( - f"No text provided for {attribute} attribute in any messages of " - f"training data. Skipping training a CountVectorizer for it." - ) - else: - for attribute in self._attributes: - if self._attribute_texts_is_non_empty(attribute_texts[attribute]): - try: - self._fit_loaded_vectorizer( - attribute, attribute_texts[attribute] - ) - - except ValueError: - logger.warning( - f"Unable to train CountVectorizer for message " - f"attribute {attribute}. Leaving an untrained " - f"CountVectorizer for it." - ) - else: - logger.debug( - f"No text provided for {attribute} attribute in any messages of " - f"training data. Skipping training a CountVectorizer for it." - ) + self._fit_loaded_vectorizer(attribute, attribute_texts[attribute]) + else: + logger.debug( + f"No text provided for {attribute} attribute in any messages of " + f"training data. Skipping training a CountVectorizer for it." 
+ ) def _fit_loaded_vectorizer( self, attribute: Text, attribute_texts: List[Text] @@ -560,14 +573,16 @@ def _fit_vectorizer_from_scratch( """ try: self.vectorizers[attribute].fit(attribute_texts) - # Add buffer for extra vocabulary tokens in future - self._add_buffer_to_vocabulary(attribute) except ValueError: logger.warning( f"Unable to train CountVectorizer for message " - f"attribute {attribute}. Leaving an untrained " + f"attribute {attribute} since the call to sklearn's " + f"`.fit()` method failed. Leaving an untrained " f"CountVectorizer for it." ) + # Add buffer for extra vocabulary tokens + # that come in during incremental training. + self._add_buffer_to_vocabulary(attribute) def _create_features( self, attribute: Text, all_tokens: List[List[Text]] diff --git a/tests/nlu/featurizers/test_count_vectors_featurizer.py b/tests/nlu/featurizers/test_count_vectors_featurizer.py index a31cc054489f..62c911cc4cdf 100644 --- a/tests/nlu/featurizers/test_count_vectors_featurizer.py +++ b/tests/nlu/featurizers/test_count_vectors_featurizer.py @@ -1,9 +1,10 @@ -from typing import List, Any - +from typing import List, Any, Text, Optional import numpy as np import pytest import scipy.sparse -from typing import Text +from pathlib import Path +from _pytest.logging import LogCaptureFixture +import logging from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer from rasa.nlu.config import RasaNLUModelConfig @@ -28,7 +29,7 @@ ], ) def test_count_vector_featurizer(sentence, expected, expected_cls): - ftr = CountVectorsFeaturizer() + ftr = CountVectorsFeaturizer({"additional_vocabulary_size": {"text": 0}}) train_message = Message(data={TEXT: sentence}) test_message = Message(data={TEXT: sentence}) @@ -63,7 +64,9 @@ def test_count_vector_featurizer(sentence, expected, expected_cls): def test_count_vector_featurizer_response_attribute_featurization( sentence, intent, response, intent_features, response_features ): - ftr = CountVectorsFeaturizer() + ftr = CountVectorsFeaturizer( + {"additional_vocabulary_size": {"text": 0, "response": 0}} + ) tk = WhitespaceTokenizer() train_message = Message(data={TEXT: sentence}) @@ -121,7 +124,9 @@ def test_count_vector_featurizer_response_attribute_featurization( def test_count_vector_featurizer_attribute_featurization( sentence, intent, response, intent_features, response_features ): - ftr = CountVectorsFeaturizer() + ftr = CountVectorsFeaturizer( + {"additional_vocabulary_size": {"text": 0, "response": 0}} + ) tk = WhitespaceTokenizer() train_message = Message(data={TEXT: sentence}) @@ -178,7 +183,12 @@ def test_count_vector_featurizer_attribute_featurization( def test_count_vector_featurizer_shared_vocab( sentence, intent, response, text_features, intent_features, response_features ): - ftr = CountVectorsFeaturizer({"use_shared_vocab": True}) + ftr = CountVectorsFeaturizer( + { + "use_shared_vocab": True, + "additional_vocabulary_size": {"text": 0, "response": 0}, + } + ) tk = WhitespaceTokenizer() train_message = Message(data={TEXT: sentence}) @@ -223,7 +233,9 @@ def test_count_vector_featurizer_shared_vocab( ], ) def test_count_vector_featurizer_oov_token(sentence, expected): - ftr = CountVectorsFeaturizer({"OOV_token": "__oov__"}) + ftr = CountVectorsFeaturizer( + {"OOV_token": "__oov__", "additional_vocabulary_size": {"text": 0}} + ) train_message = Message(data={TEXT: sentence}) WhitespaceTokenizer().process(train_message) @@ -254,7 +266,11 @@ def test_count_vector_featurizer_oov_token(sentence, expected): def 
test_count_vector_featurizer_oov_words(sentence, expected): ftr = CountVectorsFeaturizer( - {"OOV_token": "__oov__", "OOV_words": ["oov_word0", "OOV_word1"]} + { + "OOV_token": "__oov__", + "OOV_words": ["oov_word0", "OOV_word1"], + "additional_vocabulary_size": {"text": 0}, + } ) train_message = Message(data={TEXT: sentence}) WhitespaceTokenizer().process(train_message) @@ -288,7 +304,7 @@ def test_count_vector_featurizer_oov_words(sentence, expected): ) def test_count_vector_featurizer_using_tokens(tokens, expected): - ftr = CountVectorsFeaturizer() + ftr = CountVectorsFeaturizer({"additional_vocabulary_size": {"text": 0}}) # using empty string instead of real text string to make sure # count vector only can come from `tokens` feature. @@ -326,7 +342,14 @@ def test_count_vector_featurizer_using_tokens(tokens, expected): ], ) def test_count_vector_featurizer_char(sentence, expected): - ftr = CountVectorsFeaturizer({"min_ngram": 1, "max_ngram": 2, "analyzer": "char"}) + ftr = CountVectorsFeaturizer( + { + "min_ngram": 1, + "max_ngram": 2, + "analyzer": "char", + "additional_vocabulary_size": {"text": 0}, + } + ) train_message = Message(data={TEXT: sentence}) WhitespaceTokenizer().process(train_message) @@ -347,7 +370,7 @@ def test_count_vector_featurizer_char(sentence, expected): assert sen_vec is not None -def test_count_vector_featurizer_persist_load(tmp_path): +def test_count_vector_featurizer_persist_load(tmp_path: Path): # set non default values to config config = { @@ -360,6 +383,7 @@ def test_count_vector_featurizer_persist_load(tmp_path): "max_ngram": 3, "max_features": 10, "lowercase": False, + "additional_vocabulary_size": {"text": 0}, } train_ftr = CountVectorsFeaturizer(config) @@ -391,7 +415,7 @@ def test_count_vector_featurizer_persist_load(tmp_path): # load featurizer meta = train_ftr.component_config.copy() meta.update(file_dict) - test_ftr = CountVectorsFeaturizer.load(meta, str(tmp_path)) + test_ftr = CountVectorsFeaturizer.load(meta, str(tmp_path), finetune_mode=False) test_vect_params = { attribute: vectorizer.get_params() for attribute, vectorizer in test_ftr.vectorizers.items() @@ -439,7 +463,9 @@ def test_count_vector_featurizer_persist_load(tmp_path): def test_count_vectors_featurizer_train(): - featurizer = CountVectorsFeaturizer.create({}, RasaNLUModelConfig()) + featurizer = CountVectorsFeaturizer.create( + {"additional_vocabulary_size": {"text": 0, "response": 0}}, RasaNLUModelConfig() + ) sentence = "Hey how are you today ?" 
message = Message(data={TEXT: sentence}) @@ -499,7 +525,9 @@ def test_count_vector_featurizer_use_lemma( sentence_features: List[List[int]], use_lemma: bool, ): - ftr = CountVectorsFeaturizer({"use_lemma": use_lemma}) + ftr = CountVectorsFeaturizer( + {"use_lemma": use_lemma, "additional_vocabulary_size": {"text": 0}} + ) train_message = Message(data={TEXT: sentence}) train_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence)) @@ -637,3 +665,212 @@ def test_count_vector_featurizer_process_by_attribute( assert action_name_seq_vecs.toarray()[0] == action_name_features assert action_name_sen_vecs is None + + +@pytest.mark.parametrize( + "additional_size, text, real_vocabulary_size, total_vocabulary_size", + [(None, "hello my name is John.", 5, 7), (10, "hello my name is John.", 5, 15)], +) +def test_cvf_independent_train_vocabulary_expand( + additional_size: Optional[int], + text: Text, + real_vocabulary_size: int, + total_vocabulary_size: int, +): + + tokenizer = WhitespaceTokenizer() + featurizer = CountVectorsFeaturizer( + { + "additional_vocabulary_size": { + TEXT: additional_size, + RESPONSE: additional_size, + ACTION_TEXT: additional_size, + } + }, + finetune_mode=False, + ) + + train_message = Message( + data={ + TEXT: text, + INTENT: "intent_1", + RESPONSE: text, + ACTION_TEXT: text, + ACTION_NAME: "action_1", + } + ) + data = TrainingData([train_message]) + + tokenizer.train(data) + featurizer.train(data) + + for attribute in [TEXT, RESPONSE, ACTION_TEXT]: + attribute_vocabulary = featurizer.vectorizers[attribute].vocabulary_ + assert len(attribute_vocabulary) == total_vocabulary_size + assert ( + featurizer._get_starting_empty_index(attribute_vocabulary) + == real_vocabulary_size + ) + + for attribute in [INTENT, ACTION_NAME]: + attribute_vocabulary = featurizer.vectorizers[attribute].vocabulary_ + assert len(attribute_vocabulary) == 1 + + +@pytest.mark.parametrize( + "additional_size, text, real_vocabulary_size, total_vocabulary_size", + [(None, "hello my name is John.", 7, 10), (10, "hello my name is John.", 7, 17)], +) +def test_cvf_shared_train_vocabulary_expand( + additional_size: Optional[int], + text: Text, + real_vocabulary_size: int, + total_vocabulary_size: int, +): + + tokenizer = WhitespaceTokenizer() + featurizer = CountVectorsFeaturizer( + { + "additional_vocabulary_size": { + "text": additional_size, + "response": additional_size, + "action_text": additional_size, + }, + "use_shared_vocab": True, + }, + finetune_mode=False, + ) + + train_message = Message( + data={ + TEXT: text, + INTENT: "intent_1", + RESPONSE: text, + ACTION_TEXT: text, + ACTION_NAME: "action_1", + } + ) + data = TrainingData([train_message]) + + tokenizer.train(data) + featurizer.train(data) + + shared_vocabulary = featurizer.vectorizers["text"].vocabulary_ + assert len(shared_vocabulary) == total_vocabulary_size + assert ( + featurizer._get_starting_empty_index(shared_vocabulary) == real_vocabulary_size + ) + + +@pytest.mark.parametrize( + "additional_size, original_train_text, additional_train_text, total_vocabulary_size, remaining_buffer_size", + [ + (10, "hello my name is John.", "I am also new.", 15, 6), + (None, "hello my name is John.", "I am also new.", 7, 0), + ], +) +def test_cvf_incremental_train_vocabulary( + additional_size: Optional[int], + original_train_text: Text, + additional_train_text: Text, + total_vocabulary_size: int, + remaining_buffer_size: int, + tmp_path: Path, +): + + tokenizer = WhitespaceTokenizer() + original_featurizer = CountVectorsFeaturizer( + 
{"additional_vocabulary_size": {"text": additional_size}}, finetune_mode=False, + ) + train_message = Message(data={"text": original_train_text}) + data = TrainingData([train_message]) + + tokenizer.train(data) + original_featurizer.train(data) + + # Check total vocabulary size with buffer slots before finetuning + original_vocabulary = original_featurizer.vectorizers["text"].vocabulary_ + assert len(original_vocabulary) == total_vocabulary_size + + file_dict = original_featurizer.persist("ftr", str(tmp_path)) + + # load original_featurizer + meta = original_featurizer.component_config.copy() + meta.update(file_dict) + new_featurizer = CountVectorsFeaturizer.load( + meta, str(tmp_path), finetune_mode=True + ) + + # Check total vocabulary size with buffer slots before finetuning + assert len(new_featurizer.vectorizers["text"].vocabulary_) == total_vocabulary_size + + additional_train_message = Message(data={"text": additional_train_text}) + data = TrainingData([train_message, additional_train_message]) + tokenizer.train(data) + new_featurizer.train(data) + + new_vocabulary = new_featurizer.vectorizers["text"].vocabulary_ + + # Check total vocabulary size with buffer slots after finetuning + assert len(new_vocabulary) == total_vocabulary_size + + # Check remaining buffer slots after finetuning + assert ( + len(new_vocabulary) - new_featurizer._get_starting_empty_index(new_vocabulary) + == remaining_buffer_size + ) + + # Check indices of original vocabulary haven't changed in the new vocabulary + for vocab_token, vocab_index in original_vocabulary.items(): + if not vocab_token.startswith("buf_"): + assert vocab_token in new_vocabulary + assert new_vocabulary.get(vocab_token) == vocab_index + + +@pytest.mark.parametrize( + "additional_size, original_train_text, additional_train_text, overflow", + [ + (10, "hello my name is John.", "I am also new.", False), + (None, "hello my name is John.", "I am also new.", True), + (3, "hello my name is John.", "I am also new.", True), + ], +) +def test_cvf_incremental_train_vocabulary_overflow( + additional_size: Optional[int], + original_train_text: Text, + additional_train_text: Text, + overflow: bool, + tmp_path: Path, + caplog: LogCaptureFixture, +): + tokenizer = WhitespaceTokenizer() + original_featurizer = CountVectorsFeaturizer( + {"additional_vocabulary_size": {"text": additional_size}}, finetune_mode=False, + ) + train_message = Message(data={"text": original_train_text}) + data = TrainingData([train_message]) + + tokenizer.train(data) + original_featurizer.train(data) + + file_dict = original_featurizer.persist("ftr", str(tmp_path)) + + # load original_featurizer + meta = original_featurizer.component_config.copy() + meta.update(file_dict) + new_featurizer = CountVectorsFeaturizer.load( + meta, str(tmp_path), finetune_mode=True + ) + + additional_train_message = Message(data={"text": additional_train_text}) + data = TrainingData([train_message, additional_train_message]) + tokenizer.train(data) + + caplog.set_level(logging.DEBUG) + caplog.clear() + with caplog.at_level(logging.DEBUG): + new_featurizer.train(data) + if overflow: + assert "New data contains vocabulary of size" in caplog.text + else: + assert "New data contains vocabulary of size" not in caplog.text From cad6fa670c8c7541303527c9932f0a000f151fff Mon Sep 17 00:00:00 2001 From: Daksh Date: Wed, 2 Dec 2020 16:42:35 +0100 Subject: [PATCH 03/45] refactor regex featurizers and fix tests --- .../count_vectors_featurizer.py | 13 +- .../sparse_featurizer/regex_featurizer.py | 128 
++++++++++++++++-- .../nlu/featurizers/test_regex_featurizer.py | 15 +- 3 files changed, 134 insertions(+), 22 deletions(-) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index d0b675d0abcb..d3187014f3f0 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -163,7 +163,7 @@ def _load_vocabulary_params(self) -> None: ] = user_defined_additional_size def _check_attribute_vocabulary(self, attribute: Text) -> bool: - """Check if trained vocabulary exists in attribute's count vectorizer""" + """Check if trained vocabulary exists in attribute's count vectorizer.""" try: return hasattr(self.vectorizers[attribute], "vocabulary_") except (AttributeError, TypeError): @@ -222,7 +222,6 @@ def __init__( finetune_mode: bool = False, ) -> None: """Construct a new count vectorizer using the sklearn framework.""" - super().__init__(component_config) # parameters for sklearn's CountVectorizer @@ -418,7 +417,6 @@ def _get_additional_vocabulary_size( Returns: Size of additional vocabulary that should be set aside for incremental training. """ - # Vocabulary expansion for INTENTS, ACTION_NAME # and INTENT_RESPONSE_KEY is currently not supported. if attribute not in DENSE_FEATURIZABLE_ATTRIBUTES: @@ -452,7 +450,7 @@ def _add_buffer_to_vocabulary(self, attribute: Text) -> None: def _set_vocabulary( self, attribute: Text, original_vocabulary: Dict[Text, int] ) -> None: - """Set the vocabulary of the vectorizer of attribute + """Set the vocabulary of the vectorizer of attribute. Args: attribute: Message attribute for which vocabulary should be set @@ -487,7 +485,7 @@ def _attribute_texts_is_non_empty(attribute_texts: List[Text]) -> bool: return any(attribute_texts) def _train_with_shared_vocab(self, attribute_texts: Dict[Text, List[Text]]): - """Construct the vectorizers and train them with a shared vocab""" + """Construct the vectorizers and train them with a shared vocab.""" combined_cleaned_texts = [] for attribute in self._attributes: combined_cleaned_texts += attribute_texts[attribute] @@ -511,8 +509,7 @@ def _train_with_shared_vocab(self, attribute_texts: Dict[Text, List[Text]]): self._fit_loaded_vectorizer(TEXT, combined_cleaned_texts) def _train_with_independent_vocab(self, attribute_texts: Dict[Text, List[Text]]): - """Construct the vectorizers and train them with an independent vocab""" - + """Construct the vectorizers and train them with an independent vocab.""" if not self.finetune_mode: self.vectorizers = self._create_independent_vocab_vectorizers( { @@ -851,7 +848,7 @@ def load( **kwargs: Any, ) -> "CountVectorsFeaturizer": - finetune_mode = kwargs.pop("finetune_mode") + finetune_mode = kwargs.pop("finetune_mode", False) file_name = meta.get("file") featurizer_file = os.path.join(model_dir, file_name) diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index 32114d05b39e..fd3921428c5a 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -26,6 +26,7 @@ from rasa.nlu.tokenizers.tokenizer import Tokenizer from rasa.shared.nlu.training_data.training_data import TrainingData from rasa.shared.nlu.training_data.message import Message +from rasa.shared.exceptions import RasaException logger = logging.getLogger(__name__) @@ -42,18 +43,99 @@ def 
required_components(cls) -> List[Type[Component]]: "use_lookup_tables": True, # use regexes to generate features "use_regexes": True, + # Additional number of patterns to consider + # for incremental training + "number_additional_patterns": None, } def __init__( self, component_config: Optional[Dict[Text, Any]] = None, known_patterns: Optional[List[Dict[Text, Text]]] = None, + pattern_vocabulary_stats: Optional[Dict[Text, int]] = None, + finetune_mode: bool = False, ) -> None: super().__init__(component_config) self.known_patterns = known_patterns if known_patterns else [] self.case_sensitive = self.component_config["case_sensitive"] + self.number_additional_patterns = self.component_config[ + "number_additional_patterns" + ] + self.finetune_mode = finetune_mode + self.pattern_vocabulary_stats = pattern_vocabulary_stats + + if self.finetune_mode and not self.pattern_vocabulary_stats: + # If the featurizer is instantiated in finetune mode, + # the vocabulary stats for it should be known. + raise RasaException( + f"{self.__class__.__name__} was instantiated with" + f" `finetune_mode=True` but `pattern_vocabulary_stats`" + f" was left to `None`. This is invalid since the featurizer" + f" needs vocabulary statistics to featurize in finetune mode." + ) + + def _get_vocabulary_stats(self) -> Dict[Text, int]: + """ + + Returns: + + """ + if not self.finetune_mode: + max_number_patterns = ( + len(self.known_patterns) + self.number_additional_patterns + ) + return { + "pattern_slots_filled": len(self.known_patterns), + "max_number_patterns": max_number_patterns, + } + else: + return self.pattern_vocabulary_stats + + def _merge_with_existing_patterns(self, new_patterns: List[Dict[Text, Text]]): + """ + + Args: + new_patterns: + + Returns: + + """ + max_number_patterns = self.pattern_vocabulary_stats["max_number_patterns"] + pattern_name_index_map = { + pattern["name"]: index for index, pattern in enumerate(self.known_patterns) + } + patterns_dropped = False + + for extra_pattern in new_patterns: + new_pattern_name = extra_pattern["name"] + + # Some patterns may have just new examples added + # to them. These do not count as additional pattern. + if new_pattern_name in pattern_name_index_map: + self.known_patterns[pattern_name_index_map[new_pattern_name]][ + "pattern" + ] = extra_pattern["pattern"] + else: + if len(self.known_patterns) == max_number_patterns: + patterns_dropped = True + continue + self.known_patterns.extend(extra_pattern) + if patterns_dropped: + logger.warning( + f"The originally trained model was configured to " + f"handle a maximum number of {max_number_patterns} patterns. " + f"The current training data exceeds this number as " + f"there are {len(new_patterns)} patterns in total. " + f"Some patterns will be dropped and not used for " + f"featurization. It is advisable to re-train the " + f"model from scratch." 
+ ) + + def _compute_num_additional_slots(self) -> None: + if self.number_additional_patterns is None: + self.number_additional_patterns = max(10, len(self.known_patterns) * 2) def train( self, @@ -62,11 +144,17 @@ def train( **kwargs: Any, ) -> None: - self.known_patterns = pattern_utils.extract_patterns( + patterns_from_data = pattern_utils.extract_patterns( training_data, use_lookup_tables=self.component_config["use_lookup_tables"], use_regexes=self.component_config["use_regexes"], ) + if self.finetune_mode: + # Merge patterns extracted from data with known patterns + self._merge_with_existing_patterns(patterns_from_data) + else: + self.known_patterns = patterns_from_data + self._compute_num_additional_slots() for example in training_data.training_examples: for attribute in [TEXT, RESPONSE, ACTION_TEXT]: @@ -124,8 +212,10 @@ def _features_for_patterns( sequence_length = len(tokens) - sequence_features = np.zeros([sequence_length, len(self.known_patterns)]) - sentence_features = np.zeros([1, len(self.known_patterns)]) + max_number_patterns = self._get_vocabulary_stats()["max_number_patterns"] + + sequence_features = np.zeros([sequence_length, max_number_patterns]) + sentence_features = np.zeros([1, max_number_patterns]) for pattern_index, pattern in enumerate(self.known_patterns): matches = re.finditer(pattern["pattern"], message.get(TEXT), flags=flags) @@ -160,20 +250,40 @@ def load( **kwargs: Any, ) -> "RegexFeaturizer": + finetune_mode = kwargs.pop("finetune_mode", False) + file_name = meta.get("file") - regex_file = os.path.join(model_dir, file_name) + patterns_file_name = file_name + ".patterns.pkl" + regex_file = os.path.join(model_dir, patterns_file_name) + + vocabulary_stats_file_name = file_name + ".vocabulary_stats.pkl" + vocabulary_file = os.path.join(model_dir, vocabulary_stats_file_name) + + known_patterns = None + vocabulary_stats = None if os.path.exists(regex_file): known_patterns = rasa.shared.utils.io.read_json_file(regex_file) - return RegexFeaturizer(meta, known_patterns=known_patterns) - else: - return RegexFeaturizer(meta) + if os.path.exists(vocabulary_file): + vocabulary_stats = rasa.shared.utils.io.read_json_file(vocabulary_file) + + return RegexFeaturizer( + meta, + known_patterns=known_patterns, + pattern_vocabulary_stats=vocabulary_stats, + finetune_mode=finetune_mode, + ) def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]: """Persist this model into the passed directory. 
Return the metadata necessary to load the model again.""" - file_name = file_name + ".pkl" - regex_file = os.path.join(model_dir, file_name) + patterns_file_name = file_name + ".patterns.pkl" + regex_file = os.path.join(model_dir, patterns_file_name) utils.write_json_to_file(regex_file, self.known_patterns, indent=4) + vocabulary_stats_file_name = file_name + ".vocabulary_stats.pkl" + vocabulary_file = os.path.join(model_dir, vocabulary_stats_file_name) + utils.write_json_to_file( + vocabulary_file, self._get_vocabulary_stats(), indent=4 + ) return {"file": file_name} diff --git a/tests/nlu/featurizers/test_regex_featurizer.py b/tests/nlu/featurizers/test_regex_featurizer.py index 29079cd1cd16..d1714119cd11 100644 --- a/tests/nlu/featurizers/test_regex_featurizer.py +++ b/tests/nlu/featurizers/test_regex_featurizer.py @@ -72,7 +72,7 @@ def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp): {"pattern": "\\bhey*", "name": "hello", "usage": "intent"}, {"pattern": "[0-1]+", "name": "binary", "usage": "intent"}, ] - ftr = RegexFeaturizer({}, known_patterns=patterns) + ftr = RegexFeaturizer({"number_additional_patterns": 0}, known_patterns=patterns) # adds tokens to the message tokenizer = SpacyTokenizer({}) @@ -132,7 +132,7 @@ def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp): }, {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"}, ] - ftr = RegexFeaturizer() + ftr = RegexFeaturizer({"number_additional_patterns": 0}) training_data = TrainingData() training_data.lookup_tables = lookups ftr.train(training_data) @@ -173,7 +173,7 @@ def test_regex_featurizer_no_sequence(sentence, expected, expected_cls, spacy_nl {"pattern": "\\bhey*", "name": "hello", "usage": "intent"}, {"pattern": "[0-1]+", "name": "binary", "usage": "intent"}, ] - ftr = RegexFeaturizer({}, known_patterns=patterns) + ftr = RegexFeaturizer({"number_additional_patterns": 0}, known_patterns=patterns) # adds tokens to the message tokenizer = SpacyTokenizer() @@ -194,7 +194,9 @@ def test_regex_featurizer_train(): {"pattern": "[0-1]+", "name": "binary", "usage": "intent"}, ] - featurizer = RegexFeaturizer.create({}, RasaNLUModelConfig()) + featurizer = RegexFeaturizer.create( + {"number_additional_patterns": 0}, RasaNLUModelConfig() + ) sentence = "hey how are you today 19.12.2019 ?" 
message = Message(data={TEXT: sentence}) @@ -263,7 +265,10 @@ def test_regex_featurizer_case_sensitive( {"pattern": "\\bhey*", "name": "hello", "usage": "intent"}, {"pattern": "[0-1]+", "name": "binary", "usage": "intent"}, ] - ftr = RegexFeaturizer({"case_sensitive": case_sensitive}, known_patterns=patterns) + ftr = RegexFeaturizer( + {"case_sensitive": case_sensitive, "number_additional_patterns": 0}, + known_patterns=patterns, + ) # adds tokens to the message tokenizer = SpacyTokenizer() From 4590b8507ce1bb0b55cdd3af2ab035393a8ae4f2 Mon Sep 17 00:00:00 2001 From: Daksh Date: Thu, 3 Dec 2020 15:45:41 +0100 Subject: [PATCH 04/45] added tests for regex featurizer, comments and doc strings --- .../sparse_featurizer/regex_featurizer.py | 95 +++++-- .../test_count_vectors_featurizer.py | 4 +- .../nlu/featurizers/test_regex_featurizer.py | 233 ++++++++++++++++-- 3 files changed, 285 insertions(+), 47 deletions(-) diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index fd3921428c5a..01c8e8902ca4 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -27,6 +27,7 @@ from rasa.shared.nlu.training_data.training_data import TrainingData from rasa.shared.nlu.training_data.message import Message from rasa.shared.exceptions import RasaException +from rasa.shared.utils.common import lazy_property logger = logging.getLogger(__name__) @@ -55,7 +56,14 @@ def __init__( pattern_vocabulary_stats: Optional[Dict[Text, int]] = None, finetune_mode: bool = False, ) -> None: + """Construct a new regex pattern based binary featurizer. + Args: + component_config: Configuration for the component + known_patterns: Regex Patterns the component should pre-load itself with. + pattern_vocabulary_stats: Statistics about number of pattern slots filled and total number available. + finetune_mode: Load component in finetune mode. + """ super().__init__(component_config) self.known_patterns = known_patterns if known_patterns else [] @@ -76,31 +84,32 @@ def __init__( f" needs vocabulary statistics to featurize in finetune mode." ) - def _get_vocabulary_stats(self) -> Dict[Text, int]: - """ + @lazy_property + def vocabulary_stats(self) -> Dict[Text, int]: + """Compute total vocabulary size and how much of it is consumed. Returns: - + Computed vocabulary size and number of filled vocabulary slots. """ if not self.finetune_mode: max_number_patterns = ( - len(self.known_patterns) + self.number_additional_patterns + len(self.known_patterns) + self._get_num_additional_slots() ) return { "pattern_slots_filled": len(self.known_patterns), "max_number_patterns": max_number_patterns, } else: + self.pattern_vocabulary_stats["pattern_slots_filled"] = len( + self.known_patterns + ) return self.pattern_vocabulary_stats - def _merge_with_existing_patterns(self, new_patterns: List[Dict[Text, Text]]): - """ + def _merge_new_patterns(self, new_patterns: List[Dict[Text, Text]]): + """Update already known patterns with new patterns extracted from data. Args: - new_patterns: - - Returns: - + new_patterns: Patterns extracted from training data and to be merged with known patterns. 
""" max_number_patterns = self.pattern_vocabulary_stats["max_number_patterns"] pattern_name_index_map = { @@ -121,7 +130,7 @@ def _merge_with_existing_patterns(self, new_patterns: List[Dict[Text, Text]]): if len(self.known_patterns) == max_number_patterns: patterns_dropped = True continue - self.known_patterns.extend(extra_pattern) + self.known_patterns.append(extra_pattern) if patterns_dropped: logger.warning( f"The originally trained model was configured to " @@ -133,9 +142,11 @@ def _merge_with_existing_patterns(self, new_patterns: List[Dict[Text, Text]]): f"model from scratch." ) - def _compute_num_additional_slots(self) -> None: + def _get_num_additional_slots(self) -> int: + """Compute number of additional pattern slots available in vocabulary on top of known patterns.""" if self.number_additional_patterns is None: self.number_additional_patterns = max(10, len(self.known_patterns) * 2) + return self.number_additional_patterns def train( self, @@ -143,7 +154,13 @@ def train( config: Optional[RasaNLUModelConfig] = None, **kwargs: Any, ) -> None: + """Train the component with all patterns extracted from training data. + Args: + training_data: Training data consisting of training examples and patterns available. + config: NLU Pipeline config + **kwargs: Any other arguments + """ patterns_from_data = pattern_utils.extract_patterns( training_data, use_lookup_tables=self.component_config["use_lookup_tables"], @@ -151,10 +168,9 @@ def train( ) if self.finetune_mode: # Merge patterns extracted from data with known patterns - self._merge_with_existing_patterns(patterns_from_data) + self._merge_new_patterns(patterns_from_data) else: self.known_patterns = patterns_from_data - self._compute_num_additional_slots() for example in training_data.training_examples: for attribute in [TEXT, RESPONSE, ACTION_TEXT]: @@ -164,6 +180,12 @@ def process(self, message: Message, **kwargs: Any) -> None: self._text_features_with_regex(message, TEXT) def _text_features_with_regex(self, message: Message, attribute: Text) -> None: + """Helper method to extract features and set them appropriately in the message object. + + Args: + message: Message to be featurized. + attribute: Attribute of message to be featurized. + """ if self.known_patterns: sequence_features, sentence_features = self._features_for_patterns( message, attribute @@ -191,11 +213,19 @@ def _features_for_patterns( self, message: Message, attribute: Text ) -> Tuple[Optional[scipy.sparse.coo_matrix], Optional[scipy.sparse.coo_matrix]]: """Checks which known patterns match the message. + Given a sentence, returns a vector of {1,0} values indicating which regexes did match. Furthermore, if the message is tokenized, the function will mark all tokens with a dict - relating the name of the regex to whether it was matched.""" + relating the name of the regex to whether it was matched. + Args: + message: Message to be featurized. + attribute: Attribute of message to be featurized. + + Returns: + Token and sentence level features of message attribute. + """ # Attribute not set (e.g. 
response not present) if not message.get(attribute): return None, None @@ -212,7 +242,7 @@ def _features_for_patterns( sequence_length = len(tokens) - max_number_patterns = self._get_vocabulary_stats()["max_number_patterns"] + max_number_patterns = self.vocabulary_stats["max_number_patterns"] sequence_features = np.zeros([sequence_length, max_number_patterns]) sentence_features = np.zeros([1, max_number_patterns]) @@ -249,23 +279,31 @@ def load( cached_component: Optional["RegexFeaturizer"] = None, **kwargs: Any, ) -> "RegexFeaturizer": + """Load a previously trained component. + Args: + meta: Configuration of trained component. + model_dir: Path where trained pipeline is stored. + model_metadata: Metadata for the trained pipeline. + cached_component: Previously cached component(if any). + **kwargs: + """ finetune_mode = kwargs.pop("finetune_mode", False) file_name = meta.get("file") patterns_file_name = file_name + ".patterns.pkl" - regex_file = os.path.join(model_dir, patterns_file_name) vocabulary_stats_file_name = file_name + ".vocabulary_stats.pkl" - vocabulary_file = os.path.join(model_dir, vocabulary_stats_file_name) known_patterns = None vocabulary_stats = None - if os.path.exists(regex_file): - known_patterns = rasa.shared.utils.io.read_json_file(regex_file) - if os.path.exists(vocabulary_file): - vocabulary_stats = rasa.shared.utils.io.read_json_file(vocabulary_file) + if os.path.exists(patterns_file_name): + known_patterns = rasa.shared.utils.io.read_json_file(patterns_file_name) + if os.path.exists(vocabulary_stats_file_name): + vocabulary_stats = rasa.shared.utils.io.read_json_file( + vocabulary_stats_file_name + ) return RegexFeaturizer( meta, @@ -276,14 +314,19 @@ def load( def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]: """Persist this model into the passed directory. - Return the metadata necessary to load the model again.""" + + Args: + file_name: Prefix to add to all files stored as part of this component. + model_dir: Path where files should be stored. + + Returns: + Metadata necessary to load the model again. 
+ """ patterns_file_name = file_name + ".patterns.pkl" regex_file = os.path.join(model_dir, patterns_file_name) utils.write_json_to_file(regex_file, self.known_patterns, indent=4) vocabulary_stats_file_name = file_name + ".vocabulary_stats.pkl" vocabulary_file = os.path.join(model_dir, vocabulary_stats_file_name) - utils.write_json_to_file( - vocabulary_file, self._get_vocabulary_stats(), indent=4 - ) + utils.write_json_to_file(vocabulary_file, self.vocabulary_stats, indent=4) return {"file": file_name} diff --git a/tests/nlu/featurizers/test_count_vectors_featurizer.py b/tests/nlu/featurizers/test_count_vectors_featurizer.py index 62c911cc4cdf..533f3bdbf04f 100644 --- a/tests/nlu/featurizers/test_count_vectors_featurizer.py +++ b/tests/nlu/featurizers/test_count_vectors_featurizer.py @@ -866,9 +866,9 @@ def test_cvf_incremental_train_vocabulary_overflow( data = TrainingData([train_message, additional_train_message]) tokenizer.train(data) - caplog.set_level(logging.DEBUG) + caplog.set_level(logging.WARNING) caplog.clear() - with caplog.at_level(logging.DEBUG): + with caplog.at_level(logging.WARNING): new_featurizer.train(data) if overflow: assert "New data contains vocabulary of size" in caplog.text diff --git a/tests/nlu/featurizers/test_regex_featurizer.py b/tests/nlu/featurizers/test_regex_featurizer.py index d1714119cd11..6668c1a3fae6 100644 --- a/tests/nlu/featurizers/test_regex_featurizer.py +++ b/tests/nlu/featurizers/test_regex_featurizer.py @@ -2,6 +2,10 @@ import numpy as np import pytest +from pathlib import Path +from _pytest.logging import LogCaptureFixture +import os +import logging from rasa.shared.nlu.training_data.training_data import TrainingData from rasa.shared.nlu.training_data.message import Message @@ -14,19 +18,20 @@ @pytest.mark.parametrize( - "sentence, expected, labeled_tokens", + "sentence, expected, labeled_tokens, additional_vocabulary_size", [ ( "hey how are you today", [ - [0.0, 1.0, 0.0], - [0.0, 0.0, 0.0], - [0.0, 0.0, 0.0], - [0.0, 0.0, 0.0], - [0.0, 0.0, 0.0], - [0.0, 1.0, 0.0], + [0.0, 1.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 0.0, 0.0], ], [0], + 2, ), ( "hey 456 how are you", @@ -39,32 +44,41 @@ [1.0, 1.0, 0.0], ], [1, 0], + 0, ), ( "blah balh random eh", [ - [0.0, 0.0, 0.0], - [0.0, 0.0, 0.0], - [0.0, 0.0, 0.0], - [0.0, 0.0, 0.0], - [0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], ], [], + None, ), ( "a 1 digit number", [ - [0.0, 0.0, 0.0], - [1.0, 0.0, 1.0], - [0.0, 0.0, 0.0], - [0.0, 0.0, 0.0], - [1.0, 0.0, 1.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], ], [1, 1], + None, ), ], ) -def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp): +def test_regex_featurizer( + sentence: Text, + expected: List[float], + labeled_tokens: List[int], + additional_vocabulary_size: int, + spacy_nlp: Any, +): from 
rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer patterns = [ @@ -72,7 +86,10 @@ def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp): {"pattern": "\\bhey*", "name": "hello", "usage": "intent"}, {"pattern": "[0-1]+", "name": "binary", "usage": "intent"}, ] - ftr = RegexFeaturizer({"number_additional_patterns": 0}, known_patterns=patterns) + ftr = RegexFeaturizer( + {"number_additional_patterns": additional_vocabulary_size}, + known_patterns=patterns, + ) # adds tokens to the message tokenizer = SpacyTokenizer({}) @@ -279,3 +296,181 @@ def test_regex_featurizer_case_sensitive( sequence_featrures, sentence_features = ftr._features_for_patterns(message, TEXT) assert np.allclose(sequence_featrures.toarray()[0], sequence_vector, atol=1e-10) assert np.allclose(sentence_features.toarray()[-1], sentence_vector, atol=1e-10) + + +def test_persist_load(tmp_path: Path): + + patterns = [ + {"pattern": "[0-9]+", "name": "number", "usage": "intent"}, + {"pattern": "\\bhey*", "name": "hello", "usage": "intent"}, + {"pattern": "[0-1]+", "name": "binary", "usage": "intent"}, + ] + + featurizer = RegexFeaturizer.create( + {"number_additional_patterns": 5}, RasaNLUModelConfig() + ) + + sentence = "hey how are you today 19.12.2019 ?" + message = Message(data={TEXT: sentence}) + message.set(RESPONSE, sentence) + message.set(INTENT, "intent") + WhitespaceTokenizer().train(TrainingData([message])) + + featurizer.train( + TrainingData([message], regex_features=patterns), RasaNLUModelConfig() + ) + + persist_value = featurizer.persist("ftr", str(tmp_path)) + + # Test all artifacts stored as part of persist + assert persist_value["file"] == "ftr" + assert os.path.exists(os.path.join(str(tmp_path), "ftr.patterns.pkl")) + assert os.path.exists(os.path.join(str(tmp_path), "ftr.vocabulary_stats.pkl")) + assert featurizer.vocabulary_stats == { + "max_number_patterns": 8, + "pattern_slots_filled": 3, + } + + loaded_featurizer = RegexFeaturizer.load( + meta={ + "number_additional_patterns": 5, + "file": os.path.join(str(tmp_path), persist_value["file"]), + }, + finetune_mode=True, + ) + + # Test component loaded in finetune mode and also with + # same patterns as before and vocabulary statistics + assert loaded_featurizer.known_patterns == featurizer.known_patterns + assert loaded_featurizer.finetune_mode + assert loaded_featurizer.pattern_vocabulary_stats == featurizer.vocabulary_stats + + new_lookups = [{"name": "plates", "elements": "data/test/lookup_tables/plates.txt"}] + + training_data = TrainingData() + training_data.lookup_tables = new_lookups + loaded_featurizer.train(training_data) + + # Test merging of a new pattern to an already trained component. + assert len(loaded_featurizer.known_patterns) == 4 + assert loaded_featurizer.vocabulary_stats == { + "max_number_patterns": 8, + "pattern_slots_filled": 4, + } + + +def test_incremental_train_featurization(tmp_path: Path): + + patterns = [ + {"pattern": "[0-9]+", "name": "number", "usage": "intent"}, + {"pattern": "\\bhey*", "name": "hello", "usage": "intent"}, + {"pattern": "[0-1]+", "name": "binary", "usage": "intent"}, + ] + + featurizer = RegexFeaturizer.create( + {"number_additional_patterns": 5}, RasaNLUModelConfig() + ) + + sentence = "hey how are you today 19.12.2019 ?" 
+ message = Message(data={TEXT: sentence}) + message.set(RESPONSE, sentence) + message.set(INTENT, "intent") + WhitespaceTokenizer().train(TrainingData([message])) + + featurizer.train( + TrainingData([message], regex_features=patterns), RasaNLUModelConfig() + ) + + # Test featurization of message + expected = np.array([0, 1, 0, 0, 0, 0, 0, 0]) + expected_cls = np.array([1, 1, 1, 0, 0, 0, 0, 0]) + + seq_vecs, sen_vec = message.get_sparse_features(TEXT, []) + if seq_vecs: + seq_vecs = seq_vecs.features + if sen_vec: + sen_vec = sen_vec.features + + assert (6, 8) == seq_vecs.shape + assert (1, 8) == sen_vec.shape + assert np.all(seq_vecs.toarray()[0] == expected) + assert np.all(sen_vec.toarray()[-1] == expected_cls) + + persist_value = featurizer.persist("ftr", str(tmp_path)) + loaded_featurizer = RegexFeaturizer.load( + meta={ + "number_additional_patterns": 5, + "file": os.path.join(str(tmp_path), persist_value["file"]), + }, + finetune_mode=True, + ) + + new_patterns = [ + {"pattern": "\\btoday*", "name": "day", "usage": "intent"}, + {"pattern": "\\bhey+", "name": "hello", "usage": "intent"}, + ] + + message = Message(data={TEXT: sentence}) + message.set(RESPONSE, sentence) + message.set(INTENT, "intent") + WhitespaceTokenizer().train(TrainingData([message])) + + loaded_featurizer.train( + TrainingData([message], regex_features=patterns + new_patterns), + RasaNLUModelConfig(), + ) + + # Test featurization of message, this time for the extra pattern as well. + expected_token_1 = np.array([0, 1, 0, 0, 0, 0, 0, 0]) + expected_token_2 = np.array([0, 0, 0, 1, 0, 0, 0, 0]) + expected_cls = np.array([1, 1, 1, 1, 0, 0, 0, 0]) + + seq_vecs, sen_vec = message.get_sparse_features(TEXT, []) + if seq_vecs: + seq_vecs = seq_vecs.features + if sen_vec: + sen_vec = sen_vec.features + + assert (6, 8) == seq_vecs.shape + assert (1, 8) == sen_vec.shape + assert np.all(seq_vecs.toarray()[0] == expected_token_1) + assert np.all(seq_vecs.toarray()[-2] == expected_token_2) + assert np.all(sen_vec.toarray()[-1] == expected_cls) + + # we also modified a pattern, check if that is correctly modified + pattern_to_check = [ + pattern + for pattern in loaded_featurizer.known_patterns + if pattern["name"] == "hello" + ] + assert pattern_to_check == [new_patterns[1]] + + +def test_vocabulary_overflow_log(caplog: LogCaptureFixture): + + patterns = [ + {"pattern": "[0-9]+", "name": "number", "usage": "intent"}, + {"pattern": "\\bhey*", "name": "hello", "usage": "intent"}, + {"pattern": "[0-1]+", "name": "binary", "usage": "intent"}, + ] + + featurizer = RegexFeaturizer( + {"number_additional_patterns": 1}, + known_patterns=patterns, + finetune_mode=True, + pattern_vocabulary_stats={"max_number_patterns": 4, "pattern_slots_filled": 3}, + ) + + additional_patterns = [ + {"pattern": "\\btoday*", "name": "day", "usage": "intent"}, + {"pattern": "\\bhello+", "name": "greet", "usage": "intent"}, + ] + + caplog.set_level(logging.WARNING) + caplog.clear() + with caplog.at_level(logging.WARNING): + featurizer.train(TrainingData([], regex_features=additional_patterns)) + assert ( + "The originally trained model was configured to handle a maximum number of 4 patterns" + in caplog.text + ) From d3bf72532796a97808fc28fd03d9f25e67167a5b Mon Sep 17 00:00:00 2001 From: Daksh Date: Thu, 3 Dec 2020 17:19:19 +0100 Subject: [PATCH 05/45] rename 'finetune_mode' parameter inside load --- .../featurizers/sparse_featurizer/count_vectors_featurizer.py | 2 +- rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py | 2 +- 
tests/nlu/featurizers/test_count_vectors_featurizer.py | 4 ++-- tests/nlu/featurizers/test_regex_featurizer.py | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index d3187014f3f0..56a185b607e4 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -848,7 +848,7 @@ def load( **kwargs: Any, ) -> "CountVectorsFeaturizer": - finetune_mode = kwargs.pop("finetune_mode", False) + finetune_mode = kwargs.pop("should_finetune", False) file_name = meta.get("file") featurizer_file = os.path.join(model_dir, file_name) diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index 01c8e8902ca4..133ba278c03d 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -288,7 +288,7 @@ def load( cached_component: Previously cached component(if any). **kwargs: """ - finetune_mode = kwargs.pop("finetune_mode", False) + finetune_mode = kwargs.pop("should_finetune", False) file_name = meta.get("file") diff --git a/tests/nlu/featurizers/test_count_vectors_featurizer.py b/tests/nlu/featurizers/test_count_vectors_featurizer.py index 533f3bdbf04f..7aa5104e0eed 100644 --- a/tests/nlu/featurizers/test_count_vectors_featurizer.py +++ b/tests/nlu/featurizers/test_count_vectors_featurizer.py @@ -798,7 +798,7 @@ def test_cvf_incremental_train_vocabulary( meta = original_featurizer.component_config.copy() meta.update(file_dict) new_featurizer = CountVectorsFeaturizer.load( - meta, str(tmp_path), finetune_mode=True + meta, str(tmp_path), should_finetune=True ) # Check total vocabulary size with buffer slots before finetuning @@ -859,7 +859,7 @@ def test_cvf_incremental_train_vocabulary_overflow( meta = original_featurizer.component_config.copy() meta.update(file_dict) new_featurizer = CountVectorsFeaturizer.load( - meta, str(tmp_path), finetune_mode=True + meta, str(tmp_path), should_finetune=True ) additional_train_message = Message(data={"text": additional_train_text}) diff --git a/tests/nlu/featurizers/test_regex_featurizer.py b/tests/nlu/featurizers/test_regex_featurizer.py index 6668c1a3fae6..81b0c0cc331b 100644 --- a/tests/nlu/featurizers/test_regex_featurizer.py +++ b/tests/nlu/featurizers/test_regex_featurizer.py @@ -336,7 +336,7 @@ def test_persist_load(tmp_path: Path): "number_additional_patterns": 5, "file": os.path.join(str(tmp_path), persist_value["file"]), }, - finetune_mode=True, + should_finetune=True, ) # Test component loaded in finetune mode and also with @@ -402,7 +402,7 @@ def test_incremental_train_featurization(tmp_path: Path): "number_additional_patterns": 5, "file": os.path.join(str(tmp_path), persist_value["file"]), }, - finetune_mode=True, + should_finetune=True, ) new_patterns = [ From a8555c5030d044f2c4c0fdb80f516ea59cb5de72 Mon Sep 17 00:00:00 2001 From: Daksh Date: Thu, 3 Dec 2020 19:05:36 +0100 Subject: [PATCH 06/45] address review comments, make ML components inside NLU loadable in finetune mode. 
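In practical terms, these changes let a trained NLU component be reloaded with its weights still trainable and then trained further on new data. A minimal sketch of that usage, not part of the diff below — `model_dir` and `meta` are placeholder values standing in for what the model loader normally supplies:

# Sketch only: hypothetical metadata and model directory.
from rasa.nlu.classifiers.diet_classifier import DIETClassifier

model_dir = "models/nlu"                       # assumed unpacked NLU model directory
meta = {"file": "component_3_DIETClassifier"}  # assumed persisted component metadata

classifier = DIETClassifier.load(
    meta=meta,
    model_dir=model_dir,
    should_finetune=True,  # consumed inside `load` via kwargs.pop("should_finetune", False)
)
# A subsequent classifier.train(...) call reuses the loaded model instead of
# instantiating a new one, so training continues from the persisted weights.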
--- rasa/nlu/classifiers/diet_classifier.py | 59 ++++++++++++++----- rasa/nlu/constants.py | 1 + .../count_vectors_featurizer.py | 9 ++- .../sparse_featurizer/regex_featurizer.py | 17 +++++- rasa/nlu/selectors/response_selector.py | 9 ++- rasa/utils/tensorflow/models.py | 15 +++-- 6 files changed, 86 insertions(+), 24 deletions(-) diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index d16361f332ce..8a9f27c65815 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -308,6 +308,7 @@ def __init__( index_label_id_mapping: Optional[Dict[int, Text]] = None, entity_tag_specs: Optional[List[EntityTagSpec]] = None, model: Optional[RasaModel] = None, + finetune_mode: bool = False, ) -> None: """Declare instance variables with default values.""" @@ -333,6 +334,17 @@ def __init__( self.split_entities_config = self.init_split_entities() + self.finetune_mode = finetune_mode + + if not self.model and self.finetune_mode: + raise rasa.utils.exceptions.RasaException( + f"{self.__class__.__name__} was instantiated " + f"with `model=None` and `finetune_mode=True`. " + f"This is not a valid combination as the component " + f"needs an already instantiated and trained model " + f"to continue training in finetune mode." + ) + @property def label_key(self) -> Optional[Text]: """Return key if intent classification is activated.""" @@ -766,7 +778,9 @@ def train( # keep one example for persisting and loading self._data_example = model_data.first_data_example() - self.model = self._instantiate_model_class(model_data) + if not self.finetune_mode: + # No pre-trained model to load from. Create a new instance of the model. + self.model = self._instantiate_model_class(model_data) self.model.fit( model_data, @@ -959,7 +973,6 @@ def load( **kwargs: Any, ) -> "DIETClassifier": """Loads the trained model from the provided directory.""" - if not model_dir or not meta.get("file"): logger.debug( f"Failed to load model for '{cls.__name__}'. 
" @@ -968,6 +981,8 @@ def load( ) return cls(component_config=meta) + finetune_mode = kwargs.pop("should_finetune", False) + ( index_label_id_mapping, entity_tag_specs, @@ -979,7 +994,12 @@ def load( meta = train_utils.update_similarity_type(meta) model = cls._load_model( - entity_tag_specs, label_data, meta, data_example, model_dir + entity_tag_specs, + label_data, + meta, + data_example, + model_dir, + finetune_mode=finetune_mode, ) return cls( @@ -987,6 +1007,7 @@ def load( index_label_id_mapping=index_label_id_mapping, entity_tag_specs=entity_tag_specs, model=model, + finetune_mode=finetune_mode, ) @classmethod @@ -1039,6 +1060,7 @@ def _load_model( meta: Dict[Text, Any], data_example: Dict[Text, Dict[Text, List[np.ndarray]]], model_dir: Text, + finetune_mode: bool = False, ) -> "RasaModel": file_name = meta.get("file") tf_model_file = os.path.join(model_dir, file_name + ".tf_model") @@ -1051,20 +1073,27 @@ def _load_model( ) model = cls._load_model_class( - tf_model_file, model_data_example, label_data, entity_tag_specs, meta + tf_model_file, + model_data_example, + label_data, + entity_tag_specs, + meta, + finetune_mode=finetune_mode, ) - # build the graph for prediction - predict_data_example = RasaModelData( - label_key=label_key, - data={ - feature_name: features - for feature_name, features in model_data_example.items() - if TEXT in feature_name - }, - ) + if not finetune_mode: + + # build the graph for prediction + predict_data_example = RasaModelData( + label_key=label_key, + data={ + feature_name: features + for feature_name, features in model_data_example.items() + if TEXT in feature_name + }, + ) - model.build_for_predict(predict_data_example) + model.build_for_predict(predict_data_example) return model @@ -1076,6 +1105,7 @@ def _load_model_class( label_data: RasaModelData, entity_tag_specs: List[EntityTagSpec], meta: Dict[Text, Any], + finetune_mode: bool, ) -> "RasaModel": return cls.model_class().load( @@ -1085,6 +1115,7 @@ def _load_model_class( label_data=label_data, entity_tag_specs=entity_tag_specs, config=copy.deepcopy(meta), + finetune_mode=finetune_mode, ) def _instantiate_model_class(self, model_data: RasaModelData) -> "RasaModel": diff --git a/rasa/nlu/constants.py b/rasa/nlu/constants.py index 14297822acb3..d3a5b1ecec5c 100644 --- a/rasa/nlu/constants.py +++ b/rasa/nlu/constants.py @@ -78,3 +78,4 @@ FEATURIZER_CLASS_ALIAS = "alias" NO_LENGTH_RESTRICTION = -1 +MIN_ADDITIONAL_REGEX_PATTERNS = 10 diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index 56a185b607e4..5c1318b6f6ba 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -410,6 +410,7 @@ def _get_additional_vocabulary_size( this number. If not then we take the default additional vocabulary size which is 1/2 of the current vocabulary size. + Args: attribute: Message attribute for which additional vocabulary size should be computed. existing_vocabulary_size: Current size of vocabulary learnt from the training data. @@ -418,10 +419,12 @@ def _get_additional_vocabulary_size( Size of additional vocabulary that should be set aside for incremental training. """ # Vocabulary expansion for INTENTS, ACTION_NAME - # and INTENT_RESPONSE_KEY is currently not supported. 
+ # and INTENT_RESPONSE_KEY is currently not supported as + # incremental training does not support creation/deletion + # of new/existing labels(intents, actions, etc.) if attribute not in DENSE_FEATURIZABLE_ATTRIBUTES: return 0 - if self.additional_vocabulary_size.get(attribute, None) is not None: + if self.additional_vocabulary_size.get(attribute) is not None: return self.additional_vocabulary_size[attribute] return int(existing_vocabulary_size * 0.5) @@ -438,7 +441,7 @@ def _add_buffer_to_vocabulary(self, attribute: Text) -> None: """ original_vocabulary = self.vectorizers[attribute].vocabulary_ - current_vocabulary_size = self._get_starting_empty_index(original_vocabulary) + current_vocabulary_size = len(original_vocabulary) for index in range( current_vocabulary_size, current_vocabulary_size diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index 133ba278c03d..958f4d94f9b8 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -12,7 +12,11 @@ from rasa.nlu import utils from rasa.nlu.components import Component from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.constants import TOKENS_NAMES, FEATURIZER_CLASS_ALIAS +from rasa.nlu.constants import ( + TOKENS_NAMES, + FEATURIZER_CLASS_ALIAS, + MIN_ADDITIONAL_REGEX_PATTERNS, +) from rasa.shared.nlu.constants import ( TEXT, RESPONSE, @@ -145,7 +149,16 @@ def _merge_new_patterns(self, new_patterns: List[Dict[Text, Text]]): def _get_num_additional_slots(self) -> int: """Compute number of additional pattern slots available in vocabulary on top of known patterns.""" if self.number_additional_patterns is None: - self.number_additional_patterns = max(10, len(self.known_patterns) * 2) + # We take twice the number of currently defined + # regex patterns as the number of additional + # vocabulary slots to support if this parameter + # is not configured by the user. Also, to avoid having + # to retrain from scratch very often, the default number + # of additional slots is kept to MIN_ADDITIONAL_SLOTS. + # This is an empirically tuned number. 
+ self.number_additional_patterns = max( + MIN_ADDITIONAL_REGEX_PATTERNS, len(self.known_patterns) * 2 + ) return self.number_additional_patterns def train( diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py index 59b8d3b29276..f468449b658b 100644 --- a/rasa/nlu/selectors/response_selector.py +++ b/rasa/nlu/selectors/response_selector.py @@ -240,6 +240,7 @@ def __init__( model: Optional[RasaModel] = None, all_retrieval_intents: Optional[List[Text]] = None, responses: Optional[Dict[Text, List[Dict[Text, Any]]]] = None, + finetune_mode: bool = False, ) -> None: component_config = component_config or {} @@ -256,7 +257,11 @@ def __init__( self.use_text_as_label = False super().__init__( - component_config, index_label_id_mapping, entity_tag_specs, model + component_config, + index_label_id_mapping, + entity_tag_specs, + model, + finetune_mode=finetune_mode, ) @property @@ -458,6 +463,7 @@ def _load_model_class( label_data: RasaModelData, entity_tag_specs: List[EntityTagSpec], meta: Dict[Text, Any], + finetune_mode: bool = False, ) -> "RasaModel": return cls.model_class(meta[USE_TEXT_AS_LABEL]).load( @@ -467,6 +473,7 @@ def _load_model_class( label_data=label_data, entity_tag_specs=entity_tag_specs, config=copy.deepcopy(meta), + finetune_mode=finetune_mode, ) def _instantiate_model_class(self, model_data: RasaModelData) -> "RasaModel": diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py index 54e4148b6f87..089afb9ef2a5 100644 --- a/rasa/utils/tensorflow/models.py +++ b/rasa/utils/tensorflow/models.py @@ -327,9 +327,14 @@ def copy_best(self, model_file_name: Text) -> None: @classmethod def load( - cls, model_file_name: Text, model_data_example: RasaModelData, *args, **kwargs + cls, + model_file_name: Text, + model_data_example: RasaModelData, + finetune_mode=False, + *args, + **kwargs, ) -> "RasaModel": - logger.debug("Loading the model ...") + logger.debug(f"Loading the model with finetune_mode={finetune_mode}...") # create empty model model = cls(*args, **kwargs) # need to train on 1 example to build weights of the correct size @@ -341,8 +346,10 @@ def load( evaluate_on_num_examples=0, batch_strategy=SEQUENCE, silent=True, # don't confuse users with training output - loading=True, # don't use tensorboard while loading - eager=True, # no need to build tf graph, eager is faster here + loading=True, # don't use tensorboard when doing a dummy fit run + eager=False + if finetune_mode + else True, # load in eager mode only for prediction phase ) # load trained weights model.load_weights(model_file_name) From 969750a16d43844dbae2a328fe4bae4ef153a9ce Mon Sep 17 00:00:00 2001 From: Daksh Date: Thu, 3 Dec 2020 23:13:34 +0100 Subject: [PATCH 07/45] try resetting default additional slots in cvf to 0, see if results go back to normal --- .../featurizers/sparse_featurizer/count_vectors_featurizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index 5c1318b6f6ba..d0366c2cad41 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -426,7 +426,7 @@ def _get_additional_vocabulary_size( return 0 if self.additional_vocabulary_size.get(attribute) is not None: return self.additional_vocabulary_size[attribute] - return int(existing_vocabulary_size * 0.5) + return int(existing_vocabulary_size 
* 0) def _add_buffer_to_vocabulary(self, attribute: Text) -> None: """Add extra tokens to vocabulary for incremental training. From 3d469c88a1b2af110b7ae3cef4986551370fc676 Mon Sep 17 00:00:00 2001 From: Daksh Date: Fri, 4 Dec 2020 09:23:18 +0100 Subject: [PATCH 08/45] revert default in regex also, to see if model regression tests pass --- rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index 958f4d94f9b8..7eef5db6e740 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -156,9 +156,10 @@ def _get_num_additional_slots(self) -> int: # to retrain from scratch very often, the default number # of additional slots is kept to MIN_ADDITIONAL_SLOTS. # This is an empirically tuned number. - self.number_additional_patterns = max( - MIN_ADDITIONAL_REGEX_PATTERNS, len(self.known_patterns) * 2 - ) + # self.number_additional_patterns = max( + # MIN_ADDITIONAL_REGEX_PATTERNS, len(self.known_patterns) * 2 + # ) + self.number_additional_patterns = 0 return self.number_additional_patterns def train( From e464dfec73d8305c6c7dd1df0dc816bd330bb2cb Mon Sep 17 00:00:00 2001 From: Daksh Date: Fri, 4 Dec 2020 10:34:56 +0100 Subject: [PATCH 09/45] rectify how regex featurizer is loaded --- rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index 7eef5db6e740..ab51125e1413 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -306,9 +306,11 @@ def load( file_name = meta.get("file") - patterns_file_name = file_name + ".patterns.pkl" + patterns_file_name = os.path.join(model_dir, file_name + ".patterns.pkl") - vocabulary_stats_file_name = file_name + ".vocabulary_stats.pkl" + vocabulary_stats_file_name = os.path.join( + model_dir, file_name + ".vocabulary_stats.pkl" + ) known_patterns = None vocabulary_stats = None From d5fdb41673ccba4a3056e842a56d4c3d17bf3348 Mon Sep 17 00:00:00 2001 From: Daksh Date: Fri, 4 Dec 2020 10:36:11 +0100 Subject: [PATCH 10/45] revert back defaults for additional vocab params in cvf and regex --- .../sparse_featurizer/count_vectors_featurizer.py | 2 +- rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index d0366c2cad41..5c1318b6f6ba 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -426,7 +426,7 @@ def _get_additional_vocabulary_size( return 0 if self.additional_vocabulary_size.get(attribute) is not None: return self.additional_vocabulary_size[attribute] - return int(existing_vocabulary_size * 0) + return int(existing_vocabulary_size * 0.5) def _add_buffer_to_vocabulary(self, attribute: Text) -> None: """Add extra tokens to vocabulary for incremental training. 
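An aside on what the reserved pattern slots buy (illustrative sketch, not part of the patches): `number_additional_patterns` widens the regex feature space up front so that patterns learned later during incremental training fit into the existing feature vectors without changing their shape. With the numbers used in the regex featurizer tests above, 3 known patterns plus 5 reserved slots yield vectors of width 8:

# Rough sketch of the bookkeeping the tests assert on; everything here is illustrative.
known_patterns = ["number", "hello", "binary"]  # 3 trained patterns
number_additional_patterns = 5                  # reserved empty slots

vocabulary_stats = {
    "max_number_patterns": len(known_patterns) + number_additional_patterns,  # 8
    "pattern_slots_filled": len(known_patterns),                              # 3
}
assert vocabulary_stats == {"max_number_patterns": 8, "pattern_slots_filled": 3}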
diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index ab51125e1413..d1709ca4ccdb 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -156,10 +156,9 @@ def _get_num_additional_slots(self) -> int: # to retrain from scratch very often, the default number # of additional slots is kept to MIN_ADDITIONAL_SLOTS. # This is an empirically tuned number. - # self.number_additional_patterns = max( - # MIN_ADDITIONAL_REGEX_PATTERNS, len(self.known_patterns) * 2 - # ) - self.number_additional_patterns = 0 + self.number_additional_patterns = max( + MIN_ADDITIONAL_REGEX_PATTERNS, len(self.known_patterns) * 2 + ) return self.number_additional_patterns def train( From a868b0efd301fbc3d28640db3f15bc28da897617 Mon Sep 17 00:00:00 2001 From: Daksh Date: Fri, 4 Dec 2020 13:16:50 +0100 Subject: [PATCH 11/45] add default minimum for cvf as well --- rasa/nlu/constants.py | 1 + .../count_vectors_featurizer.py | 8 ++++++- .../test_count_vectors_featurizer.py | 22 ++++++++++++++----- .../nlu/featurizers/test_regex_featurizer.py | 12 ++++------ 4 files changed, 28 insertions(+), 15 deletions(-) diff --git a/rasa/nlu/constants.py b/rasa/nlu/constants.py index d3a5b1ecec5c..90dc50e07520 100644 --- a/rasa/nlu/constants.py +++ b/rasa/nlu/constants.py @@ -79,3 +79,4 @@ NO_LENGTH_RESTRICTION = -1 MIN_ADDITIONAL_REGEX_PATTERNS = 10 +MIN_ADDITIONAL_CVF_VOCABULARY = 1000 diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index 5c1318b6f6ba..b7d92678d6c0 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -21,6 +21,7 @@ MESSAGE_ATTRIBUTES, DENSE_FEATURIZABLE_ATTRIBUTES, FEATURIZER_CLASS_ALIAS, + MIN_ADDITIONAL_CVF_VOCABULARY, ) from rasa.shared.nlu.constants import ( TEXT, @@ -426,7 +427,12 @@ def _get_additional_vocabulary_size( return 0 if self.additional_vocabulary_size.get(attribute) is not None: return self.additional_vocabulary_size[attribute] - return int(existing_vocabulary_size * 0.5) + + # If the user hasn't defined additional vocabulary size, + # then we increase it by 1000 minimum. If the current + # vocabulary size is greater than 2000, we take half of + # that number as additional vocabulary size. + return max(MIN_ADDITIONAL_CVF_VOCABULARY, int(existing_vocabulary_size * 0.5)) def _add_buffer_to_vocabulary(self, attribute: Text) -> None: """Add extra tokens to vocabulary for incremental training. 
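To make the new default concrete (illustrative sketch, not part of the patch): when `additional_vocabulary_size` is left unset, the featurizer now reserves half of the learnt vocabulary but never fewer than 1000 buffer slots, which is why the five-word vocabulary in the test changes below grows to a total size of 1005:

MIN_ADDITIONAL_CVF_VOCABULARY = 1000

def default_additional_vocabulary_size(existing_vocabulary_size: int) -> int:
    # Half the learnt vocabulary, with a floor of MIN_ADDITIONAL_CVF_VOCABULARY.
    return max(MIN_ADDITIONAL_CVF_VOCABULARY, int(existing_vocabulary_size * 0.5))

assert default_additional_vocabulary_size(5) == 1000     # 5 real tokens + 1000 buffer slots -> 1005
assert default_additional_vocabulary_size(5000) == 2500  # larger vocabularies fall back to the 50% rule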
diff --git a/tests/nlu/featurizers/test_count_vectors_featurizer.py b/tests/nlu/featurizers/test_count_vectors_featurizer.py index 7aa5104e0eed..a46acdcb61c0 100644 --- a/tests/nlu/featurizers/test_count_vectors_featurizer.py +++ b/tests/nlu/featurizers/test_count_vectors_featurizer.py @@ -568,7 +568,12 @@ def test_count_vector_featurizer_action_attribute_featurization( action_name_features: np.ndarray, response_features: np.ndarray, ): - ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"}) + ftr = CountVectorsFeaturizer( + { + "token_pattern": r"(?u)\b\w+\b", + "additional_vocabulary_size": {"text": 0, "response": 0, "action_text": 0}, + } + ) tk = WhitespaceTokenizer() train_message = Message(data={TEXT: sentence}) @@ -632,7 +637,12 @@ def test_count_vector_featurizer_process_by_attribute( action_name_features: np.ndarray, response_features: np.ndarray, ): - ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"}) + ftr = CountVectorsFeaturizer( + { + "token_pattern": r"(?u)\b\w+\b", + "additional_vocabulary_size": {"text": 0, "response": 0, "action_text": 0}, + } + ) tk = WhitespaceTokenizer() # add a second example that has some response, so that the vocabulary for @@ -669,7 +679,7 @@ def test_count_vector_featurizer_process_by_attribute( @pytest.mark.parametrize( "additional_size, text, real_vocabulary_size, total_vocabulary_size", - [(None, "hello my name is John.", 5, 7), (10, "hello my name is John.", 5, 15)], + [(None, "hello my name is John.", 5, 1005), (10, "hello my name is John.", 5, 15)], ) def test_cvf_independent_train_vocabulary_expand( additional_size: Optional[int], @@ -719,7 +729,7 @@ def test_cvf_independent_train_vocabulary_expand( @pytest.mark.parametrize( "additional_size, text, real_vocabulary_size, total_vocabulary_size", - [(None, "hello my name is John.", 7, 10), (10, "hello my name is John.", 7, 17)], + [(None, "hello my name is John.", 7, 1007), (10, "hello my name is John.", 7, 17)], ) def test_cvf_shared_train_vocabulary_expand( additional_size: Optional[int], @@ -766,7 +776,7 @@ def test_cvf_shared_train_vocabulary_expand( "additional_size, original_train_text, additional_train_text, total_vocabulary_size, remaining_buffer_size", [ (10, "hello my name is John.", "I am also new.", 15, 6), - (None, "hello my name is John.", "I am also new.", 7, 0), + (None, "hello my name is John.", "I am also new.", 1005, 996), ], ) def test_cvf_incremental_train_vocabulary( @@ -831,7 +841,7 @@ def test_cvf_incremental_train_vocabulary( "additional_size, original_train_text, additional_train_text, overflow", [ (10, "hello my name is John.", "I am also new.", False), - (None, "hello my name is John.", "I am also new.", True), + (None, "hello my name is John.", "I am also new.", False), (3, "hello my name is John.", "I am also new.", True), ], ) diff --git a/tests/nlu/featurizers/test_regex_featurizer.py b/tests/nlu/featurizers/test_regex_featurizer.py index 81b0c0cc331b..4c5277bc7af6 100644 --- a/tests/nlu/featurizers/test_regex_featurizer.py +++ b/tests/nlu/featurizers/test_regex_featurizer.py @@ -332,11 +332,9 @@ def test_persist_load(tmp_path: Path): } loaded_featurizer = RegexFeaturizer.load( - meta={ - "number_additional_patterns": 5, - "file": os.path.join(str(tmp_path), persist_value["file"]), - }, + meta={"number_additional_patterns": 5, "file": persist_value["file"],}, should_finetune=True, + model_dir=str(tmp_path), ) # Test component loaded in finetune mode and also with @@ -398,11 +396,9 @@ def test_incremental_train_featurization(tmp_path: 
Path): persist_value = featurizer.persist("ftr", str(tmp_path)) loaded_featurizer = RegexFeaturizer.load( - meta={ - "number_additional_patterns": 5, - "file": os.path.join(str(tmp_path), persist_value["file"]), - }, + meta={"number_additional_patterns": 5, "file": persist_value["file"],}, should_finetune=True, + model_dir=str(tmp_path), ) new_patterns = [ From eff925fa294b2d7d43cd1c3a4ac83911ba0d5b64 Mon Sep 17 00:00:00 2001 From: Joseph Juzl Date: Fri, 4 Dec 2020 17:11:17 +0100 Subject: [PATCH 12/45] Load core model in fine-tuning mode --- rasa/core/agent.py | 12 +++++++- rasa/core/policies/ensemble.py | 35 ++++++++++++++++++++---- rasa/core/policies/fallback.py | 3 ++ rasa/core/policies/form_policy.py | 1 + rasa/core/policies/mapping_policy.py | 3 +- rasa/core/policies/memoization.py | 2 +- rasa/core/policies/policy.py | 26 +++++++++++++++++- rasa/core/policies/rule_policy.py | 3 ++ rasa/core/policies/sklearn_policy.py | 23 +++++++++++++++- rasa/core/policies/ted_policy.py | 12 +++++++- rasa/core/policies/two_stage_fallback.py | 2 ++ rasa/core/train.py | 3 +- rasa/train.py | 30 ++++++++++++-------- tests/core/conftest.py | 2 +- tests/core/test_ensemble.py | 1 + 15 files changed, 132 insertions(+), 26 deletions(-) diff --git a/rasa/core/agent.py b/rasa/core/agent.py index 8c8e4164d8a0..28e5873e7d9f 100644 --- a/rasa/core/agent.py +++ b/rasa/core/agent.py @@ -411,6 +411,8 @@ def load( model_server: Optional[EndpointConfig] = None, remote_storage: Optional[Text] = None, path_to_model_archive: Optional[Text] = None, + new_config: Optional[Dict] = None, + finetuning_epoch_fraction: float = 1.0, ) -> "Agent": """Load a persisted model from the passed path.""" try: @@ -441,7 +443,15 @@ def load( if core_model: domain = Domain.load(os.path.join(core_model, DEFAULT_DOMAIN_PATH)) - ensemble = PolicyEnsemble.load(core_model) if core_model else None + ensemble = ( + PolicyEnsemble.load( + core_model, + new_config=new_config, + finetuning_epoch_fraction=finetuning_epoch_fraction, + ) + if core_model + else None + ) # ensures the domain hasn't changed between test and train domain.compare_with_specification(core_model) diff --git a/rasa/core/policies/ensemble.py b/rasa/core/policies/ensemble.py index e7b72d8778c1..2b5d7d14fd04 100644 --- a/rasa/core/policies/ensemble.py +++ b/rasa/core/policies/ensemble.py @@ -1,12 +1,13 @@ import importlib import json import logging +from math import ceil import os import sys from collections import defaultdict from datetime import datetime from pathlib import Path -from typing import Text, Optional, Any, List, Dict, Tuple, NamedTuple, Union +from typing import Text, Optional, Any, List, Dict, Tuple, Union import rasa.core import rasa.core.training.training @@ -303,9 +304,13 @@ def _ensure_loaded_policy(cls, policy, policy_cls, policy_name: Text): ) @classmethod - def load(cls, path: Union[Text, Path]) -> "PolicyEnsemble": - """Loads policy and domain specification from storage""" - + def load( + cls, + path: Union[Text, Path], + new_config: Optional[Dict] = None, + finetuning_epoch_fraction: float = 1.0, + ) -> "PolicyEnsemble": + """Loads policy and domain specification from storage.""" metadata = cls.load_metadata(path) cls.ensure_model_compatibility(metadata) policies = [] @@ -313,7 +318,27 @@ def load(cls, path: Union[Text, Path]) -> "PolicyEnsemble": policy_cls = registry.policy_from_module_path(policy_name) dir_name = f"policy_{i}_{policy_cls.__name__}" policy_path = os.path.join(path, dir_name) - policy = policy_cls.load(policy_path) + + if new_config: + if 
"should_finetune" not in rasa.shared.utils.common.arguments_of( + policy_cls.load + ): + raise UnsupportedDialogueModelError( + f"{policy_cls.__name__} does not support fine-tuning" + ) + + config_for_policy = new_config["policies"][i] + epochs = None + if "epochs" in config_for_policy: + epochs = ceil( + config_for_policy["epochs"] * finetuning_epoch_fraction + ) + policy = policy_cls.load( + policy_path, should_finetune=True, epoch_override=epochs + ) + else: + policy = policy_cls.load(policy_path) + cls._ensure_loaded_policy(policy, policy_cls, policy_name) policies.append(policy) ensemble_cls = rasa.shared.utils.common.class_from_module_path( diff --git a/rasa/core/policies/fallback.py b/rasa/core/policies/fallback.py index aeef4f834fe4..eb31863c9584 100644 --- a/rasa/core/policies/fallback.py +++ b/rasa/core/policies/fallback.py @@ -39,10 +39,12 @@ def __init__( ambiguity_threshold: float = DEFAULT_NLU_FALLBACK_AMBIGUITY_THRESHOLD, core_threshold: float = DEFAULT_CORE_FALLBACK_THRESHOLD, fallback_action_name: Text = ACTION_DEFAULT_FALLBACK_NAME, + **kwargs: Any, ) -> None: """Create a new Fallback policy. Args: + priority: Fallback policy priority. core_threshold: if NLU confidence threshold is met, predict fallback action with confidence `core_threshold`. If this is the highest confidence in the ensemble, @@ -53,6 +55,7 @@ def __init__( ambiguity_threshold: threshold for minimum difference between confidences of the top two predictions fallback_action_name: name of the action to execute as a fallback + """ super().__init__(priority=priority) diff --git a/rasa/core/policies/form_policy.py b/rasa/core/policies/form_policy.py index ba75d8e20421..3fbd8d29f959 100644 --- a/rasa/core/policies/form_policy.py +++ b/rasa/core/policies/form_policy.py @@ -35,6 +35,7 @@ def __init__( featurizer: Optional[TrackerFeaturizer] = None, priority: int = FORM_POLICY_PRIORITY, lookup: Optional[Dict] = None, + **kwargs: Any, ) -> None: # max history is set to 2 in order to capture diff --git a/rasa/core/policies/mapping_policy.py b/rasa/core/policies/mapping_policy.py index 3bfa83c60d42..5ecb0503975c 100644 --- a/rasa/core/policies/mapping_policy.py +++ b/rasa/core/policies/mapping_policy.py @@ -43,9 +43,8 @@ class MappingPolicy(Policy): def _standard_featurizer() -> None: return None - def __init__(self, priority: int = MAPPING_POLICY_PRIORITY) -> None: + def __init__(self, priority: int = MAPPING_POLICY_PRIORITY, **kwargs: Any) -> None: """Create a new Mapping policy.""" - super().__init__(priority=priority) rasa.shared.utils.io.raise_deprecation_warning( diff --git a/rasa/core/policies/memoization.py b/rasa/core/policies/memoization.py index 55876c4a07af..47a8f6510a96 100644 --- a/rasa/core/policies/memoization.py +++ b/rasa/core/policies/memoization.py @@ -71,6 +71,7 @@ def __init__( priority: int = MEMOIZATION_POLICY_PRIORITY, max_history: Optional[int] = MAX_HISTORY_NOT_SET, lookup: Optional[Dict] = None, + **kwargs, ) -> None: """Initialize the policy. 
@@ -81,7 +82,6 @@ def __init__( lookup: a dictionary that stores featurized tracker states and predicted actions for them """ - if max_history == MAX_HISTORY_NOT_SET: max_history = OLD_DEFAULT_MAX_HISTORY # old default value rasa.shared.utils.io.raise_warning( diff --git a/rasa/core/policies/policy.py b/rasa/core/policies/policy.py index 8b54d84b9184..dcbda42582b9 100644 --- a/rasa/core/policies/policy.py +++ b/rasa/core/policies/policy.py @@ -16,6 +16,8 @@ TYPE_CHECKING, ) import numpy as np + +from rasa.core.exceptions import UnsupportedDialogueModelError from rasa.shared.core.events import Event import rasa.shared.utils.common @@ -110,6 +112,7 @@ def __init__( self, featurizer: Optional[TrackerFeaturizer] = None, priority: int = DEFAULT_POLICY_PRIORITY, + **kwargs: Any, ) -> None: self.__featurizer = self._create_featurizer(featurizer) self.priority = priority @@ -272,11 +275,19 @@ def persist(self, path: Union[Text, Path]) -> None: rasa.shared.utils.io.dump_obj_as_json_to_file(file, self._metadata()) @classmethod - def load(cls, path: Union[Text, Path]) -> "Policy": + def load( + cls, + path: Union[Text, Path], + should_finetune: bool = False, + epoch_override: Optional[float] = None, + ) -> "Policy": """Loads a policy from path. Args: path: Path to load policy from. + should_finetune: Indicates if the model components will be fine-tuned. + epoch_override: Optionally override the number of epochs + for the loaded model. Returns: An instance of `Policy`. @@ -285,11 +296,24 @@ def load(cls, path: Union[Text, Path]) -> "Policy": if metadata_file.is_file(): data = json.loads(rasa.shared.utils.io.read_file(metadata_file)) + data["should_finetune"] = should_finetune + if epoch_override: + data["epochs"] = epoch_override if (Path(path) / FEATURIZER_FILE).is_file(): featurizer = TrackerFeaturizer.load(path) data["featurizer"] = featurizer + # TODO: figure out this mess + constructure_args = rasa.shared.utils.common.arguments_of(cls) + if "kwargs" not in constructure_args and ( + (should_finetune and "should_finetune" not in constructure_args) + or (epoch_override or "epochs" not in constructure_args) + ): + raise UnsupportedDialogueModelError( + f"{cls.__name__} does not support fine-tuning" + ) + return cls(**data) logger.info( diff --git a/rasa/core/policies/rule_policy.py b/rasa/core/policies/rule_policy.py index c13086ec44af..744e13898593 100644 --- a/rasa/core/policies/rule_policy.py +++ b/rasa/core/policies/rule_policy.py @@ -108,6 +108,7 @@ def __init__( enable_fallback_prediction: bool = True, restrict_rules: bool = True, check_for_contradictions: bool = True, + **kwargs: Any, ) -> None: """Create a `RulePolicy` object. @@ -124,6 +125,8 @@ def __init__( if no rule matched. enable_fallback_prediction: If `True` `core_fallback_action_name` is predicted in case no rule matched. + restrict_rules: Restrict rules. TODO: better descriptions. + check_for_contradictions: Check for contradictions. 
""" self._core_fallback_threshold = core_fallback_threshold self._fallback_action_name = core_fallback_action_name diff --git a/rasa/core/policies/sklearn_policy.py b/rasa/core/policies/sklearn_policy.py index ef616fcdc208..27c1b9c4536f 100644 --- a/rasa/core/policies/sklearn_policy.py +++ b/rasa/core/policies/sklearn_policy.py @@ -302,7 +302,23 @@ def persist(self, path: Union[Text, Path]) -> None: ) @classmethod - def load(cls, path: Union[Text, Path]) -> Policy: + def load( + cls, + path: Union[Text, Path], + should_finetune: bool = False, + epoch_override: Optional[float] = None, + ) -> Policy: + """Load the policy from path. + + Args: + path: Path to load policy from. + should_finetune: Indicates if the model components will be fine-tuned. + epoch_override: Optionally override the number of epochs + for the loaded model. + + Returns: + An instance of `SklearnPolicy`. + """ filename = Path(path) / "sklearn_model.pkl" zero_features_filename = Path(path) / "zero_state_features.pkl" if not Path(path).exists(): @@ -321,10 +337,15 @@ def load(cls, path: Union[Text, Path]) -> Policy: meta = json.loads(rasa.shared.utils.io.read_file(meta_file)) zero_state_features = io_utils.pickle_load(zero_features_filename) + data = {"should_finetune": should_finetune} + if epoch_override: + data["epochs"] = epoch_override + policy = cls( featurizer=featurizer, priority=meta["priority"], zero_state_features=zero_state_features, + **data, ) state = io_utils.pickle_load(filename) diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py index 7a7b60352327..4b946d4eeda6 100644 --- a/rasa/core/policies/ted_policy.py +++ b/rasa/core/policies/ted_policy.py @@ -437,8 +437,14 @@ def persist(self, path: Union[Text, Path]) -> None: ) @classmethod - def load(cls, path: Union[Text, Path]) -> "TEDPolicy": + def load( + cls, + path: Union[Text, Path], + should_finetune: bool = False, + epoch_override: Optional[float] = None, + ) -> "TEDPolicy": """Loads a policy from the storage. + **Needs to load its featurizer** """ model_path = Path(path) @@ -500,6 +506,10 @@ def load(cls, path: Union[Text, Path]) -> "TEDPolicy": ) model.build_for_predict(predict_data_example) + meta["should_finetune"] = should_finetune + if epoch_override: + meta["epochs"] = epoch_override + return cls( featurizer=featurizer, priority=priority, diff --git a/rasa/core/policies/two_stage_fallback.py b/rasa/core/policies/two_stage_fallback.py index 92b2a5b43445..3ebf7db38954 100644 --- a/rasa/core/policies/two_stage_fallback.py +++ b/rasa/core/policies/two_stage_fallback.py @@ -62,10 +62,12 @@ def __init__( fallback_core_action_name: Text = ACTION_DEFAULT_FALLBACK_NAME, fallback_nlu_action_name: Text = ACTION_DEFAULT_FALLBACK_NAME, deny_suggestion_intent_name: Text = USER_INTENT_OUT_OF_SCOPE, + **kwargs: Any, ) -> None: """Create a new Two-stage Fallback policy. Args: + priority: The fallback policy priority. nlu_threshold: minimum threshold for NLU confidence. If intent prediction confidence is lower than this, predict fallback action with confidence 1.0. 
diff --git a/rasa/core/train.py b/rasa/core/train.py index c3c109f2039e..44137f6bbd00 100644 --- a/rasa/core/train.py +++ b/rasa/core/train.py @@ -31,7 +31,6 @@ async def train( exclusion_percentage: Optional[int] = None, additional_arguments: Optional[Dict] = None, model_to_finetune: Optional["Agent"] = None, - finetuning_epoch_fraction: float = 1.0, ) -> "Agent": from rasa.core import config, utils from rasa.core.utils import AvailableEndpoints @@ -66,6 +65,8 @@ async def train( training_data = await agent.load_data( training_resource, exclusion_percentage=exclusion_percentage, **data_load_args ) + if model_to_finetune: + agent.policy_ensemble = model_to_finetune.policy_ensemble agent.train(training_data, **additional_arguments) agent.persist(output_path) diff --git a/rasa/train.py b/rasa/train.py index dcc3f428b8ea..8a726223e814 100644 --- a/rasa/train.py +++ b/rasa/train.py @@ -446,7 +446,11 @@ async def _train_core_with_validated_data( ) if model_to_finetune: - model_to_finetune = _core_model_for_finetuning(model_to_finetune) + model_to_finetune = _core_model_for_finetuning( + model_to_finetune, + new_config=config, + finetuning_epoch_fraction=finetuning_epoch_fraction, + ) if not model_to_finetune: rasa.shared.utils.cli.print_warning( @@ -468,7 +472,6 @@ async def _train_core_with_validated_data( additional_arguments=additional_arguments, interpreter=interpreter, model_to_finetune=model_to_finetune, - finetuning_epoch_fraction=finetuning_epoch_fraction, ) rasa.shared.utils.cli.print_color( "Core model training completed.", color=rasa.shared.utils.io.bcolors.OKBLUE @@ -488,21 +491,24 @@ async def _train_core_with_validated_data( return _train_path -def _core_model_for_finetuning(model_to_finetune: Text) -> Optional[Agent]: +def _core_model_for_finetuning( + model_to_finetune: Text, + new_config: Optional[Dict] = None, + finetuning_epoch_fraction: float = 1.0, +) -> Optional[Agent]: path_to_archive = model.get_model_for_finetuning(model_to_finetune) if not path_to_archive: return None with model.unpack_model(path_to_archive) as unpacked: - try: - agent = Agent.load(unpacked) - # Agent might be empty if no underlying Core model was found. - if agent.domain is not None and agent.policy_ensemble is not None: - return agent - except Exception: - # Anything might go wrong. In that case we skip model finetuning. - pass - return None + agent = Agent.load( + unpacked, + new_config=new_config, + finetuning_epoch_fraction=finetuning_epoch_fraction, + ) + # Agent might be empty if no underlying Core model was found. 
+ if agent.domain is not None and agent.policy_ensemble is not None: + return agent def train_nlu( diff --git a/tests/core/conftest.py b/tests/core/conftest.py index b230a4170b35..f27dda74ded9 100644 --- a/tests/core/conftest.py +++ b/tests/core/conftest.py @@ -72,7 +72,7 @@ def as_feature(self): # noinspection PyAbstractClass,PyUnusedLocal,PyMissingConstructor class ExamplePolicy(Policy): - def __init__(self, example_arg): + def __init__(self, example_arg, **kwargs): super(ExamplePolicy, self).__init__() diff --git a/tests/core/test_ensemble.py b/tests/core/test_ensemble.py index 7f90f2591e93..d5ca949211f6 100644 --- a/tests/core/test_ensemble.py +++ b/tests/core/test_ensemble.py @@ -83,6 +83,7 @@ def __init__( is_end_to_end_prediction: bool = False, events: Optional[List[Event]] = None, optional_events: Optional[List[Event]] = None, + **kwargs: Any, ) -> None: super().__init__(priority=priority) self.predict_index = predict_index From 92526a244a6f7f96d70c2e5060b5a22e2fb42f35 Mon Sep 17 00:00:00 2001 From: Joseph Juzl Date: Mon, 7 Dec 2020 15:49:48 +0100 Subject: [PATCH 13/45] Core finetune loading test --- tests/test_train.py | 39 ++++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/tests/test_train.py b/tests/test_train.py index d5aa2eff8bf7..fa8e77a55d4b 100644 --- a/tests/test_train.py +++ b/tests/test_train.py @@ -3,12 +3,13 @@ import os from pathlib import Path from typing import Text, Dict, Any -from unittest.mock import Mock +from unittest.mock import Mock, create_autospec import pytest from _pytest.capture import CaptureFixture from _pytest.monkeypatch import MonkeyPatch +from rasa.core.policies.ted_policy import TEDPolicy import rasa.model import rasa.core import rasa.nlu @@ -18,6 +19,7 @@ from rasa.nlu.model import Interpreter from rasa.train import train_core, train_nlu, train +from rasa.utils.tensorflow.constants import EPOCHS from tests.conftest import DEFAULT_CONFIG_PATH, DEFAULT_NLU_DATA, AsyncMock from tests.core.conftest import DEFAULT_DOMAIN_PATH_WITH_SLOTS, DEFAULT_STORIES_FILE from tests.test_model import _fingerprint @@ -396,32 +398,47 @@ def test_model_finetuning_core( tmp_path: Path, monkeypatch: MonkeyPatch, default_domain_path: Text, - default_stories_file: Text, - default_stack_config: Text, - trained_rasa_model: Text, + default_nlu_data: Text, + trained_moodbot_path: Text, use_latest_model: bool, ): mocked_core_training = AsyncMock() monkeypatch.setattr(rasa.core, rasa.core.train.__name__, mocked_core_training) + mock_agent_load = Mock(wraps=Agent.load) + monkeypatch.setattr(Agent, "load", mock_agent_load) + (tmp_path / "models").mkdir() output = str(tmp_path / "models") if use_latest_model: - trained_rasa_model = str(Path(trained_rasa_model).parent) + trained_moodbot_path = str(Path(trained_moodbot_path).parent) + + # Typically models will be fine-tuned with a smaller number of epochs than training + # from scratch. + # Fine-tuning will use the number of epochs in the new config. 
+ old_config = rasa.shared.utils.io.read_yaml_file("examples/moodbot/config.yml") + old_config["policies"][0]["epochs"] = 20 + new_config_path = tmp_path / "new_config.yml" + rasa.shared.utils.io.write_yaml(old_config, new_config_path) train_core( - default_domain_path, - default_stack_config, - default_stories_file, + "examples/moodbot/domain.yml", + str(new_config_path), + "examples/moodbot/data/stories.yml", output=output, - model_to_finetune=trained_rasa_model, - finetuning_epoch_fraction=1, + model_to_finetune=trained_moodbot_path, + finetuning_epoch_fraction=0.5, ) mocked_core_training.assert_called_once() _, kwargs = mocked_core_training.call_args - assert isinstance(kwargs["model_to_finetune"], Agent) + model_to_finetune = kwargs["model_to_finetune"] + assert isinstance(model_to_finetune, Agent) + + ted_config = model_to_finetune.policy_ensemble.policies[0].config + assert ted_config[EPOCHS] == 10 + assert ted_config["should_finetune"] is True @pytest.mark.parametrize("use_latest_model", [True, False]) From 3be4ea907e5218549f5e3437d3677ad64d0e170e Mon Sep 17 00:00:00 2001 From: Joseph Juzl Date: Mon, 7 Dec 2020 16:27:53 +0100 Subject: [PATCH 14/45] Test and PR comments --- rasa/core/policies/ensemble.py | 22 ++++++++++++++------- rasa/core/policies/fallback.py | 5 +++-- rasa/core/policies/form_policy.py | 8 ++++++-- rasa/core/policies/mapping_policy.py | 6 ++++-- rasa/core/policies/memoization.py | 5 +++-- rasa/core/policies/policy.py | 25 ++++++++++++------------ rasa/core/policies/rule_policy.py | 13 +++++++++--- rasa/core/policies/sklearn_policy.py | 19 ++++++------------ rasa/core/policies/ted_policy.py | 4 ++-- rasa/core/policies/two_stage_fallback.py | 4 +++- rasa/train.py | 2 ++ tests/test_train.py | 6 +++--- 12 files changed, 70 insertions(+), 49 deletions(-) diff --git a/rasa/core/policies/ensemble.py b/rasa/core/policies/ensemble.py index 2b5d7d14fd04..8e1a94f23635 100644 --- a/rasa/core/policies/ensemble.py +++ b/rasa/core/policies/ensemble.py @@ -1,7 +1,7 @@ import importlib import json import logging -from math import ceil +import math import os import sys from collections import defaultdict @@ -303,6 +303,13 @@ def _ensure_loaded_policy(cls, policy, policy_cls, policy_name: Text): "".format(policy_name) ) + @staticmethod + def _get_updated_epochs(config_for_policy, finetuning_epoch_fraction): + epochs = None + if "epochs" in config_for_policy: + epochs = math.ceil(config_for_policy["epochs"] * finetuning_epoch_fraction) + return epochs + @classmethod def load( cls, @@ -324,15 +331,16 @@ def load( policy_cls.load ): raise UnsupportedDialogueModelError( - f"{policy_cls.__name__} does not support fine-tuning" + f"{policy_cls.__name__} does not support fine-tuning. " + f"To support fine-tuning the `load` method must have the " + f"argument `should_finetune`. This argument should be added to " + f"all policies by Rasa Open Source 3.0.0." 
) config_for_policy = new_config["policies"][i] - epochs = None - if "epochs" in config_for_policy: - epochs = ceil( - config_for_policy["epochs"] * finetuning_epoch_fraction - ) + epochs = cls._get_updated_epochs( + config_for_policy, finetuning_epoch_fraction + ) policy = policy_cls.load( policy_path, should_finetune=True, epoch_override=epochs ) diff --git a/rasa/core/policies/fallback.py b/rasa/core/policies/fallback.py index eb31863c9584..5b7f9d19febb 100644 --- a/rasa/core/policies/fallback.py +++ b/rasa/core/policies/fallback.py @@ -39,7 +39,7 @@ def __init__( ambiguity_threshold: float = DEFAULT_NLU_FALLBACK_AMBIGUITY_THRESHOLD, core_threshold: float = DEFAULT_CORE_FALLBACK_THRESHOLD, fallback_action_name: Text = ACTION_DEFAULT_FALLBACK_NAME, - **kwargs: Any, + should_finetune: bool = False, ) -> None: """Create a new Fallback policy. @@ -55,9 +55,10 @@ def __init__( ambiguity_threshold: threshold for minimum difference between confidences of the top two predictions fallback_action_name: name of the action to execute as a fallback + should_finetune: Indicates if the model components will be fine-tuned. """ - super().__init__(priority=priority) + super().__init__(priority=priority, should_finetune=should_finetune) self.nlu_threshold = nlu_threshold self.ambiguity_threshold = ambiguity_threshold diff --git a/rasa/core/policies/form_policy.py b/rasa/core/policies/form_policy.py index 3fbd8d29f959..139584870841 100644 --- a/rasa/core/policies/form_policy.py +++ b/rasa/core/policies/form_policy.py @@ -35,13 +35,17 @@ def __init__( featurizer: Optional[TrackerFeaturizer] = None, priority: int = FORM_POLICY_PRIORITY, lookup: Optional[Dict] = None, - **kwargs: Any, + should_finetune: bool = False, ) -> None: # max history is set to 2 in order to capture # previous meaningful action before action listen super().__init__( - featurizer=featurizer, priority=priority, max_history=2, lookup=lookup + featurizer=featurizer, + priority=priority, + max_history=2, + lookup=lookup, + should_finetune=should_finetune, ) rasa.shared.utils.io.raise_deprecation_warning( diff --git a/rasa/core/policies/mapping_policy.py b/rasa/core/policies/mapping_policy.py index 5ecb0503975c..039ae9287d28 100644 --- a/rasa/core/policies/mapping_policy.py +++ b/rasa/core/policies/mapping_policy.py @@ -43,9 +43,11 @@ class MappingPolicy(Policy): def _standard_featurizer() -> None: return None - def __init__(self, priority: int = MAPPING_POLICY_PRIORITY, **kwargs: Any) -> None: + def __init__( + self, priority: int = MAPPING_POLICY_PRIORITY, should_finetune: bool = False, + ) -> None: """Create a new Mapping policy.""" - super().__init__(priority=priority) + super().__init__(priority=priority, should_finetune=should_finetune) rasa.shared.utils.io.raise_deprecation_warning( f"'{MappingPolicy.__name__}' is deprecated and will be removed in " diff --git a/rasa/core/policies/memoization.py b/rasa/core/policies/memoization.py index 47a8f6510a96..113c9feb541f 100644 --- a/rasa/core/policies/memoization.py +++ b/rasa/core/policies/memoization.py @@ -71,7 +71,7 @@ def __init__( priority: int = MEMOIZATION_POLICY_PRIORITY, max_history: Optional[int] = MAX_HISTORY_NOT_SET, lookup: Optional[Dict] = None, - **kwargs, + should_finetune: bool = False, ) -> None: """Initialize the policy. 
@@ -81,6 +81,7 @@ def __init__( max_history: maximum history to take into account when featurizing trackers lookup: a dictionary that stores featurized tracker states and predicted actions for them + should_finetune: Indicates if the model components will be fine-tuned. """ if max_history == MAX_HISTORY_NOT_SET: max_history = OLD_DEFAULT_MAX_HISTORY # old default value @@ -97,7 +98,7 @@ def __init__( if not featurizer: featurizer = self._standard_featurizer(max_history) - super().__init__(featurizer, priority) + super().__init__(featurizer, priority, should_finetune=should_finetune) self.max_history = self.featurizer.max_history self.lookup = lookup if lookup is not None else {} diff --git a/rasa/core/policies/policy.py b/rasa/core/policies/policy.py index dcbda42582b9..28fb2492bb7a 100644 --- a/rasa/core/policies/policy.py +++ b/rasa/core/policies/policy.py @@ -112,13 +112,15 @@ def __init__( self, featurizer: Optional[TrackerFeaturizer] = None, priority: int = DEFAULT_POLICY_PRIORITY, - **kwargs: Any, + should_finetune: bool = False, ) -> None: self.__featurizer = self._create_featurizer(featurizer) self.priority = priority + self.should_finetune = should_finetune @property def featurizer(self): + """Returns the policy's featurizer.""" return self.__featurizer @staticmethod @@ -296,7 +298,16 @@ def load( if metadata_file.is_file(): data = json.loads(rasa.shared.utils.io.read_file(metadata_file)) - data["should_finetune"] = should_finetune + if should_finetune: + if "should_finetune" not in rasa.shared.utils.common.arguments_of(cls): + raise UnsupportedDialogueModelError( + f"{cls.__name__} does not support fine-tuning. " + f"To support fine-tuning the `__init__` method must have the " + f"argument `should_finetune`. This argument should be added to " + f"all policies by Rasa Open Source 3.0.0." + ) + data["should_finetune"] = should_finetune + if epoch_override: data["epochs"] = epoch_override @@ -304,16 +315,6 @@ def load( featurizer = TrackerFeaturizer.load(path) data["featurizer"] = featurizer - # TODO: figure out this mess - constructure_args = rasa.shared.utils.common.arguments_of(cls) - if "kwargs" not in constructure_args and ( - (should_finetune and "should_finetune" not in constructure_args) - or (epoch_override or "epochs" not in constructure_args) - ): - raise UnsupportedDialogueModelError( - f"{cls.__name__} does not support fine-tuning" - ) - return cls(**data) logger.info( diff --git a/rasa/core/policies/rule_policy.py b/rasa/core/policies/rule_policy.py index 744e13898593..beb4d322d2cc 100644 --- a/rasa/core/policies/rule_policy.py +++ b/rasa/core/policies/rule_policy.py @@ -108,7 +108,7 @@ def __init__( enable_fallback_prediction: bool = True, restrict_rules: bool = True, check_for_contradictions: bool = True, - **kwargs: Any, + should_finetune: bool = False, ) -> None: """Create a `RulePolicy` object. @@ -125,8 +125,11 @@ def __init__( if no rule matched. enable_fallback_prediction: If `True` `core_fallback_action_name` is predicted in case no rule matched. - restrict_rules: Restrict rules. TODO: better descriptions. + restrict_rules: If `True` rules are restricted to contain a maximum of 1 + user message. This is used to avoid that users build a state machine + using the rules. check_for_contradictions: Check for contradictions. + should_finetune: Indicates if the model components will be fine-tuned. 
""" self._core_fallback_threshold = core_fallback_threshold self._fallback_action_name = core_fallback_action_name @@ -139,7 +142,11 @@ def __init__( # max history is set to `None` in order to capture any lengths of rule stories super().__init__( - featurizer=featurizer, priority=priority, max_history=None, lookup=lookup + featurizer=featurizer, + priority=priority, + max_history=None, + lookup=lookup, + should_finetune=should_finetune, ) @classmethod diff --git a/rasa/core/policies/sklearn_policy.py b/rasa/core/policies/sklearn_policy.py index 27c1b9c4536f..de73592d31d1 100644 --- a/rasa/core/policies/sklearn_policy.py +++ b/rasa/core/policies/sklearn_policy.py @@ -65,6 +65,7 @@ def __init__( label_encoder: LabelEncoder = LabelEncoder(), shuffle: bool = True, zero_state_features: Optional[Dict[Text, List["Features"]]] = None, + should_finetune: bool = False, **kwargs: Any, ) -> None: """Create a new sklearn policy. @@ -72,6 +73,8 @@ def __init__( Args: featurizer: Featurizer used to convert the training data into vector format. + priority: Policy priority + max_history: Maximum history of the dialogs. model: The sklearn model or model pipeline. param_grid: If *param_grid* is not None and *cv* is given, a grid search on the given *param_grid* is performed @@ -84,8 +87,8 @@ def __init__( *inverse_transform* method. shuffle: Whether to shuffle training data. zero_state_features: Contains default feature values for attributes + should_finetune: Indicates if the model components will be fine-tuned. """ - if featurizer: if not isinstance(featurizer, MaxHistoryTrackerFeaturizer): raise TypeError( @@ -104,7 +107,7 @@ def __init__( ) featurizer = self._standard_featurizer(max_history) - super().__init__(featurizer, priority) + super().__init__(featurizer, priority, should_finetune=should_finetune) self.model = model or self._default_model() self.cv = cv @@ -308,17 +311,7 @@ def load( should_finetune: bool = False, epoch_override: Optional[float] = None, ) -> Policy: - """Load the policy from path. - - Args: - path: Path to load policy from. - should_finetune: Indicates if the model components will be fine-tuned. - epoch_override: Optionally override the number of epochs - for the loaded model. - - Returns: - An instance of `SklearnPolicy`. 
- """ + """See the docstring for `Policy.Load`.""" filename = Path(path) / "sklearn_model.pkl" zero_features_filename = Path(path) / "zero_state_features.pkl" if not Path(path).exists(): diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py index 4b946d4eeda6..aa3dd130f8a5 100644 --- a/rasa/core/policies/ted_policy.py +++ b/rasa/core/policies/ted_policy.py @@ -217,14 +217,14 @@ def __init__( max_history: Optional[int] = None, model: Optional[RasaModel] = None, zero_state_features: Optional[Dict[Text, List["Features"]]] = None, + should_finetune: bool = False, **kwargs: Any, ) -> None: """Declare instance variables with default values.""" - if not featurizer: featurizer = self._standard_featurizer(max_history) - super().__init__(featurizer, priority) + super().__init__(featurizer, priority, should_finetune=should_finetune) if isinstance(featurizer, FullDialogueTrackerFeaturizer): self.is_full_dialogue_featurizer_used = True else: diff --git a/rasa/core/policies/two_stage_fallback.py b/rasa/core/policies/two_stage_fallback.py index 3ebf7db38954..c8238189a280 100644 --- a/rasa/core/policies/two_stage_fallback.py +++ b/rasa/core/policies/two_stage_fallback.py @@ -62,7 +62,7 @@ def __init__( fallback_core_action_name: Text = ACTION_DEFAULT_FALLBACK_NAME, fallback_nlu_action_name: Text = ACTION_DEFAULT_FALLBACK_NAME, deny_suggestion_intent_name: Text = USER_INTENT_OUT_OF_SCOPE, - **kwargs: Any, + should_finetune: bool = False, ) -> None: """Create a new Two-stage Fallback policy. @@ -83,6 +83,7 @@ def __init__( denies the recognised intent for the second time. deny_suggestion_intent_name: The name of the intent which is used to detect that the user denies the suggested intents. + should_finetune: Indicates if the model components will be fine-tuned. 
""" super().__init__( priority, @@ -90,6 +91,7 @@ def __init__( ambiguity_threshold, core_threshold, fallback_core_action_name, + should_finetune=should_finetune, ) self.fallback_nlu_action_name = fallback_nlu_action_name diff --git a/rasa/train.py b/rasa/train.py index 8a726223e814..ac8225aa26ba 100644 --- a/rasa/train.py +++ b/rasa/train.py @@ -510,6 +510,8 @@ def _core_model_for_finetuning( if agent.domain is not None and agent.policy_ensemble is not None: return agent + return None + def train_nlu( config: Text, diff --git a/tests/test_train.py b/tests/test_train.py index fa8e77a55d4b..c903bef39e9c 100644 --- a/tests/test_train.py +++ b/tests/test_train.py @@ -436,9 +436,9 @@ def test_model_finetuning_core( model_to_finetune = kwargs["model_to_finetune"] assert isinstance(model_to_finetune, Agent) - ted_config = model_to_finetune.policy_ensemble.policies[0].config - assert ted_config[EPOCHS] == 10 - assert ted_config["should_finetune"] is True + ted = model_to_finetune.policy_ensemble.policies[0] + assert ted.config[EPOCHS] == 10 + assert ted.should_finetune is True @pytest.mark.parametrize("use_latest_model", [True, False]) From 5e8eeb4a793ae9479b5a1288ec7bfb7e728e3e68 Mon Sep 17 00:00:00 2001 From: Joseph Juzl Date: Mon, 7 Dec 2020 17:06:11 +0100 Subject: [PATCH 15/45] Fallback to default epochs --- rasa/core/policies/ensemble.py | 23 +++++++++++------ rasa/core/policies/policy.py | 4 ++- rasa/core/policies/sklearn_policy.py | 4 +-- rasa/core/policies/ted_policy.py | 2 +- tests/test_train.py | 37 ++++++++++++++++++++++++++++ 5 files changed, 59 insertions(+), 11 deletions(-) diff --git a/rasa/core/policies/ensemble.py b/rasa/core/policies/ensemble.py index 8e1a94f23635..83a9fc3abe16 100644 --- a/rasa/core/policies/ensemble.py +++ b/rasa/core/policies/ensemble.py @@ -7,7 +7,7 @@ from collections import defaultdict from datetime import datetime from pathlib import Path -from typing import Text, Optional, Any, List, Dict, Tuple, Union +from typing import Text, Optional, Any, List, Dict, Tuple, Type, Union import rasa.core import rasa.core.training.training @@ -42,6 +42,7 @@ from rasa.shared.core.trackers import DialogueStateTracker from rasa.shared.core.generator import TrackerWithCachedStates from rasa.core import registry +from rasa.utils.tensorflow.constants import EPOCHS logger = logging.getLogger(__name__) @@ -304,11 +305,19 @@ def _ensure_loaded_policy(cls, policy, policy_cls, policy_name: Text): ) @staticmethod - def _get_updated_epochs(config_for_policy, finetuning_epoch_fraction): - epochs = None - if "epochs" in config_for_policy: - epochs = math.ceil(config_for_policy["epochs"] * finetuning_epoch_fraction) - return epochs + def _get_updated_epochs( + policy_cls: Type[Policy], + config_for_policy: Dict[Text, Any], + finetuning_epoch_fraction: float, + ): + if EPOCHS in config_for_policy: + epochs = config_for_policy[EPOCHS] + else: + try: + epochs = policy_cls.defaults[EPOCHS] + except (KeyError, AttributeError): + return None + return math.ceil(epochs * finetuning_epoch_fraction) @classmethod def load( @@ -339,7 +348,7 @@ def load( config_for_policy = new_config["policies"][i] epochs = cls._get_updated_epochs( - config_for_policy, finetuning_epoch_fraction + policy_cls, config_for_policy, finetuning_epoch_fraction ) policy = policy_cls.load( policy_path, should_finetune=True, epoch_override=epochs diff --git a/rasa/core/policies/policy.py b/rasa/core/policies/policy.py index 28fb2492bb7a..6a45afc9b57f 100644 --- a/rasa/core/policies/policy.py +++ 
b/rasa/core/policies/policy.py @@ -36,6 +36,7 @@ from rasa.core.constants import DEFAULT_POLICY_PRIORITY from rasa.shared.core.constants import USER, SLOTS, PREVIOUS_ACTION, ACTIVE_LOOP from rasa.shared.nlu.constants import ENTITIES, INTENT, TEXT, ACTION_TEXT, ACTION_NAME +from rasa.utils.tensorflow.constants import EPOCHS if TYPE_CHECKING: from rasa.shared.nlu.training_data.features import Features @@ -114,6 +115,7 @@ def __init__( priority: int = DEFAULT_POLICY_PRIORITY, should_finetune: bool = False, ) -> None: + """Construct a new Policy object.""" self.__featurizer = self._create_featurizer(featurizer) self.priority = priority self.should_finetune = should_finetune @@ -309,7 +311,7 @@ def load( data["should_finetune"] = should_finetune if epoch_override: - data["epochs"] = epoch_override + data[EPOCHS] = epoch_override if (Path(path) / FEATURIZER_FILE).is_file(): featurizer = TrackerFeaturizer.load(path) diff --git a/rasa/core/policies/sklearn_policy.py b/rasa/core/policies/sklearn_policy.py index de73592d31d1..eb8694a4b305 100644 --- a/rasa/core/policies/sklearn_policy.py +++ b/rasa/core/policies/sklearn_policy.py @@ -27,7 +27,7 @@ from sklearn.preprocessing import LabelEncoder from rasa.shared.nlu.constants import ACTION_TEXT, TEXT from rasa.shared.nlu.training_data.features import Features -from rasa.utils.tensorflow.constants import SENTENCE +from rasa.utils.tensorflow.constants import EPOCHS, SENTENCE from rasa.utils.tensorflow.model_data import Data # noinspection PyProtectedMember @@ -332,7 +332,7 @@ def load( data = {"should_finetune": should_finetune} if epoch_override: - data["epochs"] = epoch_override + data[EPOCHS] = epoch_override policy = cls( featurizer=featurizer, diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py index aa3dd130f8a5..75c98a045c9b 100644 --- a/rasa/core/policies/ted_policy.py +++ b/rasa/core/policies/ted_policy.py @@ -508,7 +508,7 @@ def load( meta["should_finetune"] = should_finetune if epoch_override: - meta["epochs"] = epoch_override + meta[EPOCHS] = epoch_override return cls( featurizer=featurizer, diff --git a/tests/test_train.py b/tests/test_train.py index c903bef39e9c..bd64558f56e1 100644 --- a/tests/test_train.py +++ b/tests/test_train.py @@ -441,6 +441,43 @@ def test_model_finetuning_core( assert ted.should_finetune is True +def test_model_finetuning_core_with_default_epochs( + tmp_path: Path, + monkeypatch: MonkeyPatch, + default_domain_path: Text, + default_nlu_data: Text, + trained_moodbot_path: Text, +): + mocked_core_training = AsyncMock() + monkeypatch.setattr(rasa.core, rasa.core.train.__name__, mocked_core_training) + + (tmp_path / "models").mkdir() + output = str(tmp_path / "models") + + # Providing a new config with no epochs will mean the default amount are used + # and then scaled by `finetuning_epoch_fraction`. 
+ old_config = rasa.shared.utils.io.read_yaml_file("examples/moodbot/config.yml") + del old_config["policies"][0]["epochs"] + new_config_path = tmp_path / "new_config.yml" + rasa.shared.utils.io.write_yaml(old_config, new_config_path) + + train_core( + "examples/moodbot/domain.yml", + str(new_config_path), + "examples/moodbot/data/stories.yml", + output=output, + model_to_finetune=trained_moodbot_path, + finetuning_epoch_fraction=2, + ) + + mocked_core_training.assert_called_once() + _, kwargs = mocked_core_training.call_args + model_to_finetune = kwargs["model_to_finetune"] + + ted = model_to_finetune.policy_ensemble.policies[0] + assert ted.config[EPOCHS] == TEDPolicy.defaults[EPOCHS] * 2 + + @pytest.mark.parametrize("use_latest_model", [True, False]) def test_model_finetuning_nlu( tmp_path: Path, From ab2058df1d30bce41a351d34a7a0110e94c83fe3 Mon Sep 17 00:00:00 2001 From: Joseph Juzl Date: Mon, 7 Dec 2020 17:53:47 +0100 Subject: [PATCH 16/45] Test policy and ensemble fine-tuning exception cases --- tests/core/conftest.py | 4 ++-- tests/core/test_ensemble.py | 24 ++++++++++++++++++++++-- tests/core/test_policies.py | 23 +++++++++++++++++++++++ 3 files changed, 47 insertions(+), 4 deletions(-) diff --git a/tests/core/conftest.py b/tests/core/conftest.py index f27dda74ded9..3fa2fa2ac965 100644 --- a/tests/core/conftest.py +++ b/tests/core/conftest.py @@ -72,8 +72,8 @@ def as_feature(self): # noinspection PyAbstractClass,PyUnusedLocal,PyMissingConstructor class ExamplePolicy(Policy): - def __init__(self, example_arg, **kwargs): - super(ExamplePolicy, self).__init__() + def __init__(self, example_arg, should_finetune=False): + super(ExamplePolicy, self).__init__(should_finetune=should_finetune) class MockedMongoTrackerStore(MongoTrackerStore): diff --git a/tests/core/test_ensemble.py b/tests/core/test_ensemble.py index d5ca949211f6..dabc4cc5d047 100644 --- a/tests/core/test_ensemble.py +++ b/tests/core/test_ensemble.py @@ -4,6 +4,7 @@ import pytest import copy +from rasa.core.exceptions import UnsupportedDialogueModelError from rasa.core.policies.memoization import MemoizationPolicy, AugmentedMemoizationPolicy from rasa.shared.nlu.interpreter import NaturalLanguageInterpreter, RegexInterpreter @@ -74,6 +75,25 @@ def test_policy_loading_simple(tmp_path: Path): assert original_policy_ensemble.policies == loaded_policy_ensemble.policies +class NoFinetunePolicy(Policy): + @classmethod + def load(cls, _) -> Policy: + return NoFinetunePolicy() + + def persist(self, _) -> None: + pass + + +def test_policy_loading_unsupported_finetune(tmp_path: Path): + original_policy_ensemble = PolicyEnsemble([NoFinetunePolicy()]) + original_policy_ensemble.train([], None, RegexInterpreter()) + original_policy_ensemble.persist(str(tmp_path)) + + with pytest.raises(UnsupportedDialogueModelError) as execinfo: + PolicyEnsemble.load(str(tmp_path), new_config={"some": "new_config"}) + assert "NoFinetunePolicy does not support fine-tuning." 
in str(execinfo.value) + + class ConstantPolicy(Policy): def __init__( self, @@ -83,9 +103,9 @@ def __init__( is_end_to_end_prediction: bool = False, events: Optional[List[Event]] = None, optional_events: Optional[List[Event]] = None, - **kwargs: Any, + should_finetune: bool = False, ) -> None: - super().__init__(priority=priority) + super().__init__(priority=priority, should_finetune=should_finetune) self.predict_index = predict_index self.confidence = confidence self.is_end_to_end_prediction = is_end_to_end_prediction diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py index b1a65f7e5d0f..de07a275c91c 100644 --- a/tests/core/test_policies.py +++ b/tests/core/test_policies.py @@ -7,8 +7,10 @@ from _pytest.monkeypatch import MonkeyPatch from rasa.core.channels import OutputChannel +from rasa.core.exceptions import UnsupportedDialogueModelError from rasa.core.nlg import NaturalLanguageGenerator from rasa.shared.core.generator import TrackerWithCachedStates +import rasa.shared.utils.io from rasa.core import training import rasa.core.actions.action @@ -1229,3 +1231,24 @@ def test_get_training_trackers_for_policy( def test_deprecation_warnings_for_old_rule_like_policies(policy: Type[Policy]): with pytest.warns(FutureWarning): policy(None) + + +class NoFinetunePolicy(Policy): + def __init__(self): + pass + + def persist(self, _) -> None: + pass + + @classmethod + def _metadata_filename(cls): + return "no_finetune_policy" + + +def test_loading_unsupported_policy_as_finetune(tmp_path: Path): + rasa.shared.utils.io.write_text_file( + "{}", tmp_path / NoFinetunePolicy._metadata_filename() + ) + with pytest.raises(UnsupportedDialogueModelError) as execinfo: + NoFinetunePolicy.load(str(tmp_path), should_finetune=True) + assert "NoFinetunePolicy does not support fine-tuning." in str(execinfo.value) From cbab2d0a080792256c8f215589ec7b62177d1f4f Mon Sep 17 00:00:00 2001 From: Joseph Juzl Date: Mon, 7 Dec 2020 18:32:15 +0100 Subject: [PATCH 17/45] Remove epoch_override from Policy.load --- rasa/core/policies/ensemble.py | 20 +++++++++++++++++--- rasa/core/policies/policy.py | 12 +----------- tests/core/test_ensemble.py | 29 +++++++++++++++++++++++++++-- 3 files changed, 45 insertions(+), 16 deletions(-) diff --git a/rasa/core/policies/ensemble.py b/rasa/core/policies/ensemble.py index 83a9fc3abe16..7599deabedc2 100644 --- a/rasa/core/policies/ensemble.py +++ b/rasa/core/policies/ensemble.py @@ -350,9 +350,23 @@ def load( epochs = cls._get_updated_epochs( policy_cls, config_for_policy, finetuning_epoch_fraction ) - policy = policy_cls.load( - policy_path, should_finetune=True, epoch_override=epochs - ) + if epochs: + if "epoch_override" not in rasa.shared.utils.common.arguments_of( + policy_cls.load + ): + raise UnsupportedDialogueModelError( + f"{policy_cls.__name__} does not support fine-tuning. " + f"To support fine-tuning the `load` method must have the " + f"argument `epoch_override` if the policy uses epochs." + f"This argument should be added to all policies by " + f"Rasa Open Source 3.0.0." 
+ ) + + policy = policy_cls.load( + policy_path, should_finetune=True, epoch_override=epochs + ) + else: + policy = policy_cls.load(policy_path, should_finetune=True) else: policy = policy_cls.load(policy_path) diff --git a/rasa/core/policies/policy.py b/rasa/core/policies/policy.py index 6a45afc9b57f..084584b24e4d 100644 --- a/rasa/core/policies/policy.py +++ b/rasa/core/policies/policy.py @@ -279,19 +279,12 @@ def persist(self, path: Union[Text, Path]) -> None: rasa.shared.utils.io.dump_obj_as_json_to_file(file, self._metadata()) @classmethod - def load( - cls, - path: Union[Text, Path], - should_finetune: bool = False, - epoch_override: Optional[float] = None, - ) -> "Policy": + def load(cls, path: Union[Text, Path], should_finetune: bool = False,) -> "Policy": """Loads a policy from path. Args: path: Path to load policy from. should_finetune: Indicates if the model components will be fine-tuned. - epoch_override: Optionally override the number of epochs - for the loaded model. Returns: An instance of `Policy`. @@ -310,9 +303,6 @@ def load( ) data["should_finetune"] = should_finetune - if epoch_override: - data[EPOCHS] = epoch_override - if (Path(path) / FEATURIZER_FILE).is_file(): featurizer = TrackerFeaturizer.load(path) data["featurizer"] = featurizer diff --git a/tests/core/test_ensemble.py b/tests/core/test_ensemble.py index dabc4cc5d047..1b85c6108fdf 100644 --- a/tests/core/test_ensemble.py +++ b/tests/core/test_ensemble.py @@ -1,6 +1,8 @@ from pathlib import Path -from typing import List, Any, Text, Optional +from typing import List, Any, Text, Optional, Union +from unittest.mock import Mock +from _pytest.monkeypatch import MonkeyPatch import pytest import copy @@ -77,7 +79,16 @@ def test_policy_loading_simple(tmp_path: Path): class NoFinetunePolicy(Policy): @classmethod - def load(cls, _) -> Policy: + def load(cls, path: Union[Text, Path]) -> Policy: + return NoFinetunePolicy() + + def persist(self, _) -> None: + pass + + +class NoFinetunePolicyEpochs(Policy): + @classmethod + def load(cls, path: Union[Text, Path], should_finetune: bool = False) -> Policy: return NoFinetunePolicy() def persist(self, _) -> None: @@ -94,6 +105,20 @@ def test_policy_loading_unsupported_finetune(tmp_path: Path): assert "NoFinetunePolicy does not support fine-tuning." in str(execinfo.value) +def test_policy_loading_unsupported_finetune_epochs( + tmp_path: Path, monkeypatch: MonkeyPatch +): + original_policy_ensemble = PolicyEnsemble([NoFinetunePolicyEpochs()]) + original_policy_ensemble.train([], None, RegexInterpreter()) + original_policy_ensemble.persist(str(tmp_path)) + monkeypatch.setattr(PolicyEnsemble, "_get_updated_epochs", Mock(return_value=10)) + + with pytest.raises(UnsupportedDialogueModelError) as execinfo: + PolicyEnsemble.load(str(tmp_path), new_config={"policies": [{}]}) + assert "NoFinetunePolicyEpochs does not support fine-tuning." in str(execinfo.value) + assert "if the policy uses epochs." 
in str(execinfo.value) + + class ConstantPolicy(Policy): def __init__( self, From 364c31c25adbaebb29ccd47350dcaa2ce4917ae0 Mon Sep 17 00:00:00 2001 From: Daksh Varshneya Date: Tue, 8 Dec 2020 00:44:09 +0100 Subject: [PATCH 18/45] Apply suggestions from code review Co-authored-by: Tobias Wochinger --- .../count_vectors_featurizer.py | 22 +++++++++---------- .../sparse_featurizer/regex_featurizer.py | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index b7d92678d6c0..70e4b3e16a73 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -164,7 +164,7 @@ def _load_vocabulary_params(self) -> None: ] = user_defined_additional_size def _check_attribute_vocabulary(self, attribute: Text) -> bool: - """Check if trained vocabulary exists in attribute's count vectorizer.""" + """Checks if trained vocabulary exists in attribute's count vectorizer.""" try: return hasattr(self.vectorizers[attribute], "vocabulary_") except (AttributeError, TypeError): @@ -390,7 +390,7 @@ def _update_vectorizer_vocabulary( f"which is larger than the maximum vocabulary size({len(existing_vocabulary)}) " f"of the original model. Some tokens will have to be dropped " f"in order to continue training. It is advised to re-train the " - f"model from scratch on complete data." + f"model from scratch on the complete data." ) for token in vocabulary: if token not in existing_vocabulary: @@ -404,9 +404,9 @@ def _update_vectorizer_vocabulary( def _get_additional_vocabulary_size( self, attribute: Text, existing_vocabulary_size: int ) -> int: - """Get additional vocabulary size to be saved for incremental training. + """Gets additional vocabulary size to be saved for incremental training. - If `self.additional_vocabulary_size` is not None, + If `self.additional_vocabulary_size` is not `None`, we return that as the user should have specified this number. If not then we take the default additional vocabulary size which is 1/2 of the @@ -473,14 +473,14 @@ def _set_vocabulary( def _construct_vocabulary_from_texts( vectorizer: CountVectorizer, texts: List[Text] ) -> Set: - """Apply vectorizer's preprocessor on texts to get the vocabulary from texts. + """Applies vectorizer's preprocessor on texts to get the vocabulary from texts. Args: vectorizer: Sklearn's count vectorizer which has been pre-configured. texts: Examples from which the vocabulary should be constructed Returns: - Set of unique vocabulary words extracted. + Unique vocabulary words extracted. 
""" analyzer = vectorizer.build_analyzer() vocabulary_words = set() @@ -493,8 +493,8 @@ def _construct_vocabulary_from_texts( def _attribute_texts_is_non_empty(attribute_texts: List[Text]) -> bool: return any(attribute_texts) - def _train_with_shared_vocab(self, attribute_texts: Dict[Text, List[Text]]): - """Construct the vectorizers and train them with a shared vocab.""" + def _train_with_shared_vocab(self, attribute_texts: Dict[Text, List[Text]]) -> None: + """Constructs the vectorizers and train them with a shared vocab.""" combined_cleaned_texts = [] for attribute in self._attributes: combined_cleaned_texts += attribute_texts[attribute] @@ -517,7 +517,7 @@ def _train_with_shared_vocab(self, attribute_texts: Dict[Text, List[Text]]): else: self._fit_loaded_vectorizer(TEXT, combined_cleaned_texts) - def _train_with_independent_vocab(self, attribute_texts: Dict[Text, List[Text]]): + def _train_with_independent_vocab(self, attribute_texts: Dict[Text, List[Text]]) -> None: """Construct the vectorizers and train them with an independent vocab.""" if not self.finetune_mode: self.vectorizers = self._create_independent_vocab_vectorizers( @@ -550,7 +550,7 @@ def _train_with_independent_vocab(self, attribute_texts: Dict[Text, List[Text]]) def _fit_loaded_vectorizer( self, attribute: Text, attribute_texts: List[Text] ) -> None: - """Fit training texts to a previously trained count vectorizer. + """Fits training texts to a previously trained count vectorizer. We do not use the `.fit()` method because the new unseen words should occupy the buffer slots of the vocabulary. @@ -570,7 +570,7 @@ def _fit_loaded_vectorizer( def _fit_vectorizer_from_scratch( self, attribute: Text, attribute_texts: List[Text] ) -> None: - """Fit training texts to an untrained count vectorizer. + """Fits training texts to an untrained count vectorizer. Args: attribute: Message attribute for which the vectorizer is to be trained. diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index c177c8afbfc9..9d24464fef21 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -295,7 +295,7 @@ def load( cached_component: Optional["RegexFeaturizer"] = None, **kwargs: Any, ) -> "RegexFeaturizer": - """Load a previously trained component. + """Loads a previously trained component. Args: meta: Configuration of trained component. 
From c71fcd86cb295476978d7d246fba7cfde1787d24 Mon Sep 17 00:00:00 2001 From: Daksh Date: Tue, 8 Dec 2020 00:44:52 +0100 Subject: [PATCH 19/45] review comments and add tests for loaded diet and rs --- rasa/nlu/classifiers/diet_classifier.py | 8 ++-- .../count_vectors_featurizer.py | 31 +++++++------ .../sparse_featurizer/regex_featurizer.py | 16 +++---- rasa/utils/tensorflow/models.py | 12 +++++ tests/nlu/classifiers/test_diet_classifier.py | 24 +++++++--- tests/nlu/selectors/test_selectors.py | 44 +++++++++++++++++++ 6 files changed, 104 insertions(+), 31 deletions(-) diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index 8a9f27c65815..9b5f9a115605 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -311,7 +311,6 @@ def __init__( finetune_mode: bool = False, ) -> None: """Declare instance variables with default values.""" - if component_config is not None and EPOCHS not in component_config: rasa.shared.utils.io.raise_warning( f"Please configure the number of '{EPOCHS}' in your configuration file." @@ -970,6 +969,7 @@ def load( model_dir: Text = None, model_metadata: Metadata = None, cached_component: Optional["DIETClassifier"] = None, + should_finetune: bool = False, **kwargs: Any, ) -> "DIETClassifier": """Loads the trained model from the provided directory.""" @@ -981,8 +981,6 @@ def load( ) return cls(component_config=meta) - finetune_mode = kwargs.pop("should_finetune", False) - ( index_label_id_mapping, entity_tag_specs, @@ -999,7 +997,7 @@ def load( meta, data_example, model_dir, - finetune_mode=finetune_mode, + finetune_mode=should_finetune, ) return cls( @@ -1007,7 +1005,7 @@ def load( index_label_id_mapping=index_label_id_mapping, entity_tag_specs=entity_tag_specs, model=model, - finetune_mode=finetune_mode, + finetune_mode=should_finetune, ) @classmethod diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index b7d92678d6c0..e1a0f8751142 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -371,7 +371,7 @@ def _get_starting_empty_index(vocabulary: Dict[Text, int]) -> int: return len(vocabulary) def _update_vectorizer_vocabulary( - self, attribute: Text, vocabulary: Set[Text] + self, attribute: Text, new_vocabulary: Set[Text] ) -> None: """Update the existing vocabulary of the vectorizer with new unseen words. @@ -379,27 +379,34 @@ def _update_vectorizer_vocabulary( Args: attribute: Message attribute for which vocabulary should be updated. - vocabulary: Set of words to expand the vocabulary with if they are unseen. + new_vocabulary: Set of words to expand the vocabulary with if they are unseen. """ existing_vocabulary: Dict[Text, int] = self.vectorizers[attribute].vocabulary - available_empty_index = self._get_starting_empty_index(existing_vocabulary) - if len(vocabulary) > len(existing_vocabulary): + if len(new_vocabulary) > len(existing_vocabulary): logger.warning( - f"New data contains vocabulary of size {len(vocabulary)} for attribute {attribute} " + f"New data contains vocabulary of size {len(new_vocabulary)} for attribute {attribute} " f"which is larger than the maximum vocabulary size({len(existing_vocabulary)}) " f"of the original model. Some tokens will have to be dropped " f"in order to continue training. It is advised to re-train the " f"model from scratch on complete data." 
) + self._merge_new_vocabulary_tokens(existing_vocabulary, new_vocabulary) + self._set_vocabulary(attribute, existing_vocabulary) + + def _merge_new_vocabulary_tokens( + self, existing_vocabulary: Dict[Text, int], vocabulary: Set[Text] + ): + available_empty_index = self._get_starting_empty_index(existing_vocabulary) for token in vocabulary: if token not in existing_vocabulary: existing_vocabulary[token] = available_empty_index del existing_vocabulary[f"{BUFFER_SLOTS_PREFIX}{available_empty_index}"] available_empty_index += 1 if available_empty_index == len(existing_vocabulary): + # We have exhausted all available vocabulary slots. + # Drop the remaining vocabulary. break - self._set_vocabulary(attribute, existing_vocabulary) def _get_additional_vocabulary_size( self, attribute: Text, existing_vocabulary_size: int @@ -435,7 +442,7 @@ def _get_additional_vocabulary_size( return max(MIN_ADDITIONAL_CVF_VOCABULARY, int(existing_vocabulary_size * 0.5)) def _add_buffer_to_vocabulary(self, attribute: Text) -> None: - """Add extra tokens to vocabulary for incremental training. + """Adds extra tokens to vocabulary for incremental training. These extra tokens act as buffer slots which are used up sequentially when more data is received as part of incremental training. Each of @@ -459,7 +466,7 @@ def _add_buffer_to_vocabulary(self, attribute: Text) -> None: def _set_vocabulary( self, attribute: Text, original_vocabulary: Dict[Text, int] ) -> None: - """Set the vocabulary of the vectorizer of attribute. + """Sets the vocabulary of the vectorizer of attribute. Args: attribute: Message attribute for which vocabulary should be set @@ -518,7 +525,7 @@ def _train_with_shared_vocab(self, attribute_texts: Dict[Text, List[Text]]): self._fit_loaded_vectorizer(TEXT, combined_cleaned_texts) def _train_with_independent_vocab(self, attribute_texts: Dict[Text, List[Text]]): - """Construct the vectorizers and train them with an independent vocab.""" + """Constructs the vectorizers and train them with an independent vocab.""" if not self.finetune_mode: self.vectorizers = self._create_independent_vocab_vectorizers( { @@ -854,11 +861,9 @@ def load( model_dir: Optional[Text] = None, model_metadata: Optional[Metadata] = None, cached_component: Optional["CountVectorsFeaturizer"] = None, + should_finetune: bool = False, **kwargs: Any, ) -> "CountVectorsFeaturizer": - - finetune_mode = kwargs.pop("should_finetune", False) - file_name = meta.get("file") featurizer_file = os.path.join(model_dir, file_name) @@ -878,7 +883,7 @@ def load( meta, vocabulary=vocabulary ) - ftr = cls(meta, vectorizers, finetune_mode) + ftr = cls(meta, vectorizers, should_finetune) # make sure the vocabulary has been loaded correctly for attribute in vectorizers: diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index c177c8afbfc9..20c47a134fbc 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -62,7 +62,7 @@ def __init__( pattern_vocabulary_stats: Optional[Dict[Text, int]] = None, finetune_mode: bool = False, ) -> None: - """Construct new features for regexes and lookup table using regex expressions. + """Constructs new features for regexes and lookup table using regex expressions. 
Args: component_config: Configuration for the component @@ -92,7 +92,7 @@ def __init__( @lazy_property def vocabulary_stats(self) -> Dict[Text, int]: - """Compute total vocabulary size and how much of it is consumed. + """Computes total vocabulary size and how much of it is consumed. Returns: Computed vocabulary size and number of filled vocabulary slots. @@ -112,7 +112,7 @@ def vocabulary_stats(self) -> Dict[Text, int]: return self.pattern_vocabulary_stats def _merge_new_patterns(self, new_patterns: List[Dict[Text, Text]]): - """Update already known patterns with new patterns extracted from data. + """Updates already known patterns with new patterns extracted from data. Args: new_patterns: Patterns extracted from training data and to be merged with known patterns. @@ -149,7 +149,7 @@ def _merge_new_patterns(self, new_patterns: List[Dict[Text, Text]]): ) def _get_num_additional_slots(self) -> int: - """Compute number of additional pattern slots available in vocabulary on top of known patterns.""" + """Computes number of additional pattern slots available in vocabulary on top of known patterns.""" if self.number_additional_patterns is None: # We take twice the number of currently defined # regex patterns as the number of additional @@ -169,7 +169,7 @@ def train( config: Optional[RasaNLUModelConfig] = None, **kwargs: Any, ) -> None: - """Train the component with all patterns extracted from training data. + """Trains the component with all patterns extracted from training data. Args: training_data: Training data consisting of training examples and patterns available. @@ -293,6 +293,7 @@ def load( model_dir: Optional[Text] = None, model_metadata: Optional[Metadata] = None, cached_component: Optional["RegexFeaturizer"] = None, + should_finetune: bool = False, **kwargs: Any, ) -> "RegexFeaturizer": """Load a previously trained component. @@ -302,10 +303,9 @@ def load( model_dir: Path where trained pipeline is stored. model_metadata: Metadata for the trained pipeline. cached_component: Previously cached component(if any). + should_finetune: Indicates whether to load the component for further finetuning. **kwargs: """ - finetune_mode = kwargs.pop("should_finetune", False) - file_name = meta.get("file") patterns_file_name = os.path.join(model_dir, file_name + ".patterns.pkl") @@ -327,7 +327,7 @@ def load( meta, known_patterns=known_patterns, pattern_vocabulary_stats=vocabulary_stats, - finetune_mode=finetune_mode, + finetune_mode=should_finetune, ) def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]: diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py index 089afb9ef2a5..c1ac9c0fc696 100644 --- a/rasa/utils/tensorflow/models.py +++ b/rasa/utils/tensorflow/models.py @@ -334,6 +334,18 @@ def load( *args, **kwargs, ) -> "RasaModel": + """Loads a model from the given weights. + + Args: + model_file_name: Path to file containing model weights. + model_data_example: Example data point to construct the model architecture. + finetune_mode: Indicates whether to load the model for further finetuning. + *args: Any other non key-worded arguments. + **kwargs: Any other key-worded arguments. + + Returns: + Loaded model with weights appropriately set. 
+ """ logger.debug(f"Loading the model with finetune_mode={finetune_mode}...") # create empty model model = cls(*args, **kwargs) diff --git a/tests/nlu/classifiers/test_diet_classifier.py b/tests/nlu/classifiers/test_diet_classifier.py index dbf6e3ab11f0..9dc938f4ab58 100644 --- a/tests/nlu/classifiers/test_diet_classifier.py +++ b/tests/nlu/classifiers/test_diet_classifier.py @@ -3,6 +3,7 @@ import numpy as np import pytest from unittest.mock import Mock +from typing import List, Tuple, Text, Dict, Any, Optional from rasa.shared.nlu.training_data.features import Features from rasa.nlu import train @@ -108,7 +109,10 @@ def test_check_labels_features_exist(messages, expected): async def _train_persist_load_with_different_settings( - pipeline, component_builder, tmp_path + pipeline: List[Dict[Text, Any]], + component_builder: ComponentBuilder, + tmp_path: Path, + should_finetune: bool, ): _config = RasaNLUModelConfig({"pipeline": pipeline, "language": "en"}) @@ -122,7 +126,11 @@ async def _train_persist_load_with_different_settings( assert trainer.pipeline assert trained.pipeline - loaded = Interpreter.load(persisted_path, component_builder) + loaded = Interpreter.load( + persisted_path, + component_builder, + new_config=_config if should_finetune else None, + ) assert loaded.pipeline assert loaded.parse("Rasa is great!") == trained.parse("Rasa is great!") @@ -130,7 +138,7 @@ async def _train_persist_load_with_different_settings( @pytest.mark.skip_on_windows async def test_train_persist_load_with_different_settings_non_windows( - component_builder, tmpdir + component_builder: ComponentBuilder, tmpdir: Path ): pipeline = [ { @@ -142,7 +150,10 @@ async def test_train_persist_load_with_different_settings_non_windows( {"name": "DIETClassifier", MASKED_LM: True, EPOCHS: 1}, ] await _train_persist_load_with_different_settings( - pipeline, component_builder, tmpdir + pipeline, component_builder, tmpdir, False + ) + await _train_persist_load_with_different_settings( + pipeline, component_builder, tmpdir, True ) @@ -153,7 +164,10 @@ async def test_train_persist_load_with_different_settings(component_builder, tmp {"name": "DIETClassifier", LOSS_TYPE: "margin", EPOCHS: 1}, ] await _train_persist_load_with_different_settings( - pipeline, component_builder, tmpdir + pipeline, component_builder, tmpdir, False + ) + await _train_persist_load_with_different_settings( + pipeline, component_builder, tmpdir, True ) diff --git a/tests/nlu/selectors/test_selectors.py b/tests/nlu/selectors/test_selectors.py index d25d28c0367e..560bf9f9b67c 100644 --- a/tests/nlu/selectors/test_selectors.py +++ b/tests/nlu/selectors/test_selectors.py @@ -1,6 +1,7 @@ from pathlib import Path import pytest +from typing import List, Dict, Text, Any from rasa.nlu import train from rasa.nlu.components import ComponentBuilder @@ -238,3 +239,46 @@ async def test_train_model_checkpointing( """ all_files = list(best_model_file.rglob("*.*")) assert len(all_files) > 4 + + +async def _train_persist_load_with_different_settings( + pipeline: List[Dict[Text, Any]], + component_builder: ComponentBuilder, + tmp_path: Path, + should_finetune: bool, +): + _config = RasaNLUModelConfig({"pipeline": pipeline, "language": "en"}) + + (trainer, trained, persisted_path) = await train( + _config, + path=str(tmp_path), + data="data/examples/rasa/demo-rasa.md", + component_builder=component_builder, + ) + + assert trainer.pipeline + assert trained.pipeline + + loaded = Interpreter.load( + persisted_path, + component_builder, + new_config=_config if 
should_finetune else None, + ) + + assert loaded.pipeline + assert loaded.parse("Rasa is great!") == trained.parse("Rasa is great!") + + +@pytest.mark.skip_on_windows +async def test_train_persist_load(component_builder: ComponentBuilder, tmpdir: Path): + pipeline = [ + {"name": "WhitespaceTokenizer"}, + {"name": "CountVectorsFeaturizer"}, + {"name": "ResponseSelector", EPOCHS: 1}, + ] + await _train_persist_load_with_different_settings( + pipeline, component_builder, tmpdir, False + ) + await _train_persist_load_with_different_settings( + pipeline, component_builder, tmpdir, True + ) From 95a3a5432e2c8942f965a39da14eb42906982459 Mon Sep 17 00:00:00 2001 From: Daksh Date: Tue, 8 Dec 2020 10:11:38 +0100 Subject: [PATCH 20/45] fix regex tests --- tests/nlu/featurizers/test_regex_featurizer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/nlu/featurizers/test_regex_featurizer.py b/tests/nlu/featurizers/test_regex_featurizer.py index 1e1a20d4e30f..5e2a4c31a3ca 100644 --- a/tests/nlu/featurizers/test_regex_featurizer.py +++ b/tests/nlu/featurizers/test_regex_featurizer.py @@ -166,7 +166,9 @@ def test_lookup_tables_without_use_word_boundaries( {"name": "cites", "elements": ["北京", "上海", "广州", "深圳", "杭州"],}, {"name": "dates", "elements": ["昨天", "今天", "明天", "后天"],}, ] - ftr = RegexFeaturizer({"use_word_boundaries": False}) + ftr = RegexFeaturizer( + {"use_word_boundaries": False, "number_additional_patterns": 0} + ) training_data = TrainingData() training_data.lookup_tables = lookups ftr.train(training_data) From 3b4ccc094108fd0b9f1b92b85d571482ef7fcc27 Mon Sep 17 00:00:00 2001 From: Joseph Juzl Date: Tue, 8 Dec 2020 14:11:55 +0100 Subject: [PATCH 21/45] use kwargs --- rasa/core/policies/ensemble.py | 44 +++++++++------------ rasa/core/policies/fallback.py | 6 +-- rasa/core/policies/form_policy.py | 4 +- rasa/core/policies/mapping_policy.py | 6 +-- rasa/core/policies/memoization.py | 5 +-- rasa/core/policies/policy.py | 38 +++++++++++------- rasa/core/policies/rule_policy.py | 5 +-- rasa/core/policies/sklearn_policy.py | 17 +++----- rasa/core/policies/ted_policy.py | 16 +++----- rasa/core/policies/two_stage_fallback.py | 5 +-- tests/core/conftest.py | 4 +- tests/core/test_ensemble.py | 50 ++++++++++-------------- tests/core/test_policies.py | 21 +++++++--- 13 files changed, 102 insertions(+), 119 deletions(-) diff --git a/rasa/core/policies/ensemble.py b/rasa/core/policies/ensemble.py index 7599deabedc2..40d9e2192b1e 100644 --- a/rasa/core/policies/ensemble.py +++ b/rasa/core/policies/ensemble.py @@ -335,43 +335,37 @@ def load( dir_name = f"policy_{i}_{policy_cls.__name__}" policy_path = os.path.join(path, dir_name) + context = {} if new_config: - if "should_finetune" not in rasa.shared.utils.common.arguments_of( - policy_cls.load - ): - raise UnsupportedDialogueModelError( - f"{policy_cls.__name__} does not support fine-tuning. " - f"To support fine-tuning the `load` method must have the " - f"argument `should_finetune`. This argument should be added to " - f"all policies by Rasa Open Source 3.0.0." - ) + context["should_finetune"] = True config_for_policy = new_config["policies"][i] epochs = cls._get_updated_epochs( policy_cls, config_for_policy, finetuning_epoch_fraction ) if epochs: - if "epoch_override" not in rasa.shared.utils.common.arguments_of( - policy_cls.load - ): - raise UnsupportedDialogueModelError( - f"{policy_cls.__name__} does not support fine-tuning. 
" - f"To support fine-tuning the `load` method must have the " - f"argument `epoch_override` if the policy uses epochs." - f"This argument should be added to all policies by " - f"Rasa Open Source 3.0.0." - ) - - policy = policy_cls.load( - policy_path, should_finetune=True, epoch_override=epochs + context["epoch_override"] = epochs + + if "kwargs" not in rasa.shared.utils.common.arguments_of(policy_cls.load): + if context: + raise UnsupportedDialogueModelError( + f"`{policy_cls.__name__}.load` does not accept **kwargs. " + f"Attempting to pass {context} to the policy. " + f"This argument should be added to all policies by " + f"Rasa Open Source 3.0.0." ) else: - policy = policy_cls.load(policy_path, should_finetune=True) - else: - policy = policy_cls.load(policy_path) + rasa.shared.utils.io.raise_deprecation_warning( + f"{policy_cls.__name__} `load` method does not " + f"accept **kwargs. This is required for contextual" + f" information e.g. the flag `should_finetune`.", + warn_until_version="3.0.0", + ) + policy = policy_cls.load(policy_path, **context) cls._ensure_loaded_policy(policy, policy_cls, policy_name) policies.append(policy) + ensemble_cls = rasa.shared.utils.common.class_from_module_path( metadata["ensemble_name"] ) diff --git a/rasa/core/policies/fallback.py b/rasa/core/policies/fallback.py index 5b7f9d19febb..13331f45840c 100644 --- a/rasa/core/policies/fallback.py +++ b/rasa/core/policies/fallback.py @@ -39,7 +39,7 @@ def __init__( ambiguity_threshold: float = DEFAULT_NLU_FALLBACK_AMBIGUITY_THRESHOLD, core_threshold: float = DEFAULT_CORE_FALLBACK_THRESHOLD, fallback_action_name: Text = ACTION_DEFAULT_FALLBACK_NAME, - should_finetune: bool = False, + **kwargs, ) -> None: """Create a new Fallback policy. @@ -55,10 +55,8 @@ def __init__( ambiguity_threshold: threshold for minimum difference between confidences of the top two predictions fallback_action_name: name of the action to execute as a fallback - should_finetune: Indicates if the model components will be fine-tuned. 
- """ - super().__init__(priority=priority, should_finetune=should_finetune) + super().__init__(priority=priority, **kwargs) self.nlu_threshold = nlu_threshold self.ambiguity_threshold = ambiguity_threshold diff --git a/rasa/core/policies/form_policy.py b/rasa/core/policies/form_policy.py index 139584870841..e2c7f3d6b640 100644 --- a/rasa/core/policies/form_policy.py +++ b/rasa/core/policies/form_policy.py @@ -35,7 +35,7 @@ def __init__( featurizer: Optional[TrackerFeaturizer] = None, priority: int = FORM_POLICY_PRIORITY, lookup: Optional[Dict] = None, - should_finetune: bool = False, + **kwargs, ) -> None: # max history is set to 2 in order to capture @@ -45,7 +45,7 @@ def __init__( priority=priority, max_history=2, lookup=lookup, - should_finetune=should_finetune, + **kwargs, ) rasa.shared.utils.io.raise_deprecation_warning( diff --git a/rasa/core/policies/mapping_policy.py b/rasa/core/policies/mapping_policy.py index 039ae9287d28..594895713524 100644 --- a/rasa/core/policies/mapping_policy.py +++ b/rasa/core/policies/mapping_policy.py @@ -43,11 +43,9 @@ class MappingPolicy(Policy): def _standard_featurizer() -> None: return None - def __init__( - self, priority: int = MAPPING_POLICY_PRIORITY, should_finetune: bool = False, - ) -> None: + def __init__(self, priority: int = MAPPING_POLICY_PRIORITY, **kwargs,) -> None: """Create a new Mapping policy.""" - super().__init__(priority=priority, should_finetune=should_finetune) + super().__init__(priority=priority, **kwargs) rasa.shared.utils.io.raise_deprecation_warning( f"'{MappingPolicy.__name__}' is deprecated and will be removed in " diff --git a/rasa/core/policies/memoization.py b/rasa/core/policies/memoization.py index 113c9feb541f..eded89d74ede 100644 --- a/rasa/core/policies/memoization.py +++ b/rasa/core/policies/memoization.py @@ -71,7 +71,7 @@ def __init__( priority: int = MEMOIZATION_POLICY_PRIORITY, max_history: Optional[int] = MAX_HISTORY_NOT_SET, lookup: Optional[Dict] = None, - should_finetune: bool = False, + **kwargs, ) -> None: """Initialize the policy. @@ -81,7 +81,6 @@ def __init__( max_history: maximum history to take into account when featurizing trackers lookup: a dictionary that stores featurized tracker states and predicted actions for them - should_finetune: Indicates if the model components will be fine-tuned. 
""" if max_history == MAX_HISTORY_NOT_SET: max_history = OLD_DEFAULT_MAX_HISTORY # old default value @@ -98,7 +97,7 @@ def __init__( if not featurizer: featurizer = self._standard_featurizer(max_history) - super().__init__(featurizer, priority, should_finetune=should_finetune) + super().__init__(featurizer, priority, **kwargs) self.max_history = self.featurizer.max_history self.lookup = lookup if lookup is not None else {} diff --git a/rasa/core/policies/policy.py b/rasa/core/policies/policy.py index 084584b24e4d..e73de2a981ab 100644 --- a/rasa/core/policies/policy.py +++ b/rasa/core/policies/policy.py @@ -113,12 +113,11 @@ def __init__( self, featurizer: Optional[TrackerFeaturizer] = None, priority: int = DEFAULT_POLICY_PRIORITY, - should_finetune: bool = False, + **kwargs, ) -> None: - """Construct a new Policy object.""" + """Constructs a new Policy object.""" self.__featurizer = self._create_featurizer(featurizer) self.priority = priority - self.should_finetune = should_finetune @property def featurizer(self): @@ -279,12 +278,11 @@ def persist(self, path: Union[Text, Path]) -> None: rasa.shared.utils.io.dump_obj_as_json_to_file(file, self._metadata()) @classmethod - def load(cls, path: Union[Text, Path], should_finetune: bool = False,) -> "Policy": + def load(cls, path: Union[Text, Path], **kwargs) -> "Policy": """Loads a policy from path. Args: path: Path to load policy from. - should_finetune: Indicates if the model components will be fine-tuned. Returns: An instance of `Policy`. @@ -293,20 +291,32 @@ def load(cls, path: Union[Text, Path], should_finetune: bool = False,) -> "Polic if metadata_file.is_file(): data = json.loads(rasa.shared.utils.io.read_file(metadata_file)) - if should_finetune: - if "should_finetune" not in rasa.shared.utils.common.arguments_of(cls): - raise UnsupportedDialogueModelError( - f"{cls.__name__} does not support fine-tuning. " - f"To support fine-tuning the `__init__` method must have the " - f"argument `should_finetune`. This argument should be added to " - f"all policies by Rasa Open Source 3.0.0." - ) - data["should_finetune"] = should_finetune if (Path(path) / FEATURIZER_FILE).is_file(): featurizer = TrackerFeaturizer.load(path) data["featurizer"] = featurizer + data = {} + if "should_finetune" in kwargs: + data["should_finetune"] = kwargs["should_finetune"] + + constructor_args = rasa.shared.utils.common.arguments_of(cls) + if "kwargs" not in constructor_args: + if set(data.keys()).issubset(set(constructor_args)): + rasa.shared.utils.io.raise_deprecation_warning( + f"{cls.__name__} `__init__` method does not accept **kwargs " + f"This is required for contextual information e.g. the flag " + f"`should_finetune`.", + warn_until_version="3.0.0", + ) + else: + raise UnsupportedDialogueModelError( + f"{cls.__name__} `__init__` method does not accept **kwargs. " + f"Attempting to pass {data} to the policy. " + f"This argument should be added to all policies by " + f"Rasa Open Source 3.0.0." + ) + return cls(**data) logger.info( diff --git a/rasa/core/policies/rule_policy.py b/rasa/core/policies/rule_policy.py index beb4d322d2cc..6616ea0dcd4a 100644 --- a/rasa/core/policies/rule_policy.py +++ b/rasa/core/policies/rule_policy.py @@ -108,7 +108,7 @@ def __init__( enable_fallback_prediction: bool = True, restrict_rules: bool = True, check_for_contradictions: bool = True, - should_finetune: bool = False, + **kwargs, ) -> None: """Create a `RulePolicy` object. @@ -129,7 +129,6 @@ def __init__( user message. 
This is used to avoid that users build a state machine using the rules. check_for_contradictions: Check for contradictions. - should_finetune: Indicates if the model components will be fine-tuned. """ self._core_fallback_threshold = core_fallback_threshold self._fallback_action_name = core_fallback_action_name @@ -146,7 +145,7 @@ def __init__( priority=priority, max_history=None, lookup=lookup, - should_finetune=should_finetune, + **kwargs, ) @classmethod diff --git a/rasa/core/policies/sklearn_policy.py b/rasa/core/policies/sklearn_policy.py index eb8694a4b305..750f0bc7e763 100644 --- a/rasa/core/policies/sklearn_policy.py +++ b/rasa/core/policies/sklearn_policy.py @@ -65,7 +65,6 @@ def __init__( label_encoder: LabelEncoder = LabelEncoder(), shuffle: bool = True, zero_state_features: Optional[Dict[Text, List["Features"]]] = None, - should_finetune: bool = False, **kwargs: Any, ) -> None: """Create a new sklearn policy. @@ -87,7 +86,6 @@ def __init__( *inverse_transform* method. shuffle: Whether to shuffle training data. zero_state_features: Contains default feature values for attributes - should_finetune: Indicates if the model components will be fine-tuned. """ if featurizer: if not isinstance(featurizer, MaxHistoryTrackerFeaturizer): @@ -107,7 +105,7 @@ def __init__( ) featurizer = self._standard_featurizer(max_history) - super().__init__(featurizer, priority, should_finetune=should_finetune) + super().__init__(featurizer, priority, **kwargs) self.model = model or self._default_model() self.cv = cv @@ -305,12 +303,7 @@ def persist(self, path: Union[Text, Path]) -> None: ) @classmethod - def load( - cls, - path: Union[Text, Path], - should_finetune: bool = False, - epoch_override: Optional[float] = None, - ) -> Policy: + def load(cls, path: Union[Text, Path], **kwargs,) -> Policy: """See the docstring for `Policy.Load`.""" filename = Path(path) / "sklearn_model.pkl" zero_features_filename = Path(path) / "zero_state_features.pkl" @@ -330,9 +323,9 @@ def load( meta = json.loads(rasa.shared.utils.io.read_file(meta_file)) zero_state_features = io_utils.pickle_load(zero_features_filename) - data = {"should_finetune": should_finetune} - if epoch_override: - data[EPOCHS] = epoch_override + data = {"should_finetune": kwargs.get("should_finetune", False)} + if "epoch_override" in kwargs: + data[EPOCHS] = kwargs["epoch_override"] policy = cls( featurizer=featurizer, diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py index 75c98a045c9b..2442c257ef84 100644 --- a/rasa/core/policies/ted_policy.py +++ b/rasa/core/policies/ted_policy.py @@ -217,14 +217,13 @@ def __init__( max_history: Optional[int] = None, model: Optional[RasaModel] = None, zero_state_features: Optional[Dict[Text, List["Features"]]] = None, - should_finetune: bool = False, **kwargs: Any, ) -> None: """Declare instance variables with default values.""" if not featurizer: featurizer = self._standard_featurizer(max_history) - super().__init__(featurizer, priority, should_finetune=should_finetune) + super().__init__(featurizer, priority, **kwargs) if isinstance(featurizer, FullDialogueTrackerFeaturizer): self.is_full_dialogue_featurizer_used = True else: @@ -437,12 +436,7 @@ def persist(self, path: Union[Text, Path]) -> None: ) @classmethod - def load( - cls, - path: Union[Text, Path], - should_finetune: bool = False, - epoch_override: Optional[float] = None, - ) -> "TEDPolicy": + def load(cls, path: Union[Text, Path], **kwargs,) -> "TEDPolicy": """Loads a policy from the storage. 
**Needs to load its featurizer** @@ -506,9 +500,9 @@ def load( ) model.build_for_predict(predict_data_example) - meta["should_finetune"] = should_finetune - if epoch_override: - meta[EPOCHS] = epoch_override + meta["should_finetune"] = kwargs.get("should_finetune", False) + if "epoch_override" in kwargs: + meta[EPOCHS] = kwargs["epoch_override"] return cls( featurizer=featurizer, diff --git a/rasa/core/policies/two_stage_fallback.py b/rasa/core/policies/two_stage_fallback.py index c8238189a280..04335d207ca4 100644 --- a/rasa/core/policies/two_stage_fallback.py +++ b/rasa/core/policies/two_stage_fallback.py @@ -62,7 +62,7 @@ def __init__( fallback_core_action_name: Text = ACTION_DEFAULT_FALLBACK_NAME, fallback_nlu_action_name: Text = ACTION_DEFAULT_FALLBACK_NAME, deny_suggestion_intent_name: Text = USER_INTENT_OUT_OF_SCOPE, - should_finetune: bool = False, + **kwargs, ) -> None: """Create a new Two-stage Fallback policy. @@ -83,7 +83,6 @@ def __init__( denies the recognised intent for the second time. deny_suggestion_intent_name: The name of the intent which is used to detect that the user denies the suggested intents. - should_finetune: Indicates if the model components will be fine-tuned. """ super().__init__( priority, @@ -91,7 +90,7 @@ def __init__( ambiguity_threshold, core_threshold, fallback_core_action_name, - should_finetune=should_finetune, + **kwargs, ) self.fallback_nlu_action_name = fallback_nlu_action_name diff --git a/tests/core/conftest.py b/tests/core/conftest.py index 3fa2fa2ac965..095342e288de 100644 --- a/tests/core/conftest.py +++ b/tests/core/conftest.py @@ -72,8 +72,8 @@ def as_feature(self): # noinspection PyAbstractClass,PyUnusedLocal,PyMissingConstructor class ExamplePolicy(Policy): - def __init__(self, example_arg, should_finetune=False): - super(ExamplePolicy, self).__init__(should_finetune=should_finetune) + def __init__(self, *args, **kwargs): + super(ExamplePolicy, self).__init__(*args, **kwargs) class MockedMongoTrackerStore(MongoTrackerStore): diff --git a/tests/core/test_ensemble.py b/tests/core/test_ensemble.py index 1b85c6108fdf..02d2825779d2 100644 --- a/tests/core/test_ensemble.py +++ b/tests/core/test_ensemble.py @@ -2,6 +2,7 @@ from typing import List, Any, Text, Optional, Union from unittest.mock import Mock +from _pytest.capture import CaptureFixture from _pytest.monkeypatch import MonkeyPatch import pytest import copy @@ -40,7 +41,7 @@ class WorkingPolicy(Policy): @classmethod - def load(cls, _) -> Policy: + def load(cls, *args, **kwargs) -> Policy: return WorkingPolicy() def persist(self, _) -> None: @@ -77,46 +78,35 @@ def test_policy_loading_simple(tmp_path: Path): assert original_policy_ensemble.policies == loaded_policy_ensemble.policies -class NoFinetunePolicy(Policy): +class PolicyWithoutLoadKwargs(Policy): @classmethod def load(cls, path: Union[Text, Path]) -> Policy: - return NoFinetunePolicy() + return PolicyWithoutLoadKwargs() def persist(self, _) -> None: pass -class NoFinetunePolicyEpochs(Policy): - @classmethod - def load(cls, path: Union[Text, Path], should_finetune: bool = False) -> Policy: - return NoFinetunePolicy() - - def persist(self, _) -> None: - pass - - -def test_policy_loading_unsupported_finetune(tmp_path: Path): - original_policy_ensemble = PolicyEnsemble([NoFinetunePolicy()]) +def test_policy_loading_no_kwargs_with_context(tmp_path: Path): + original_policy_ensemble = PolicyEnsemble([PolicyWithoutLoadKwargs()]) original_policy_ensemble.train([], None, RegexInterpreter()) original_policy_ensemble.persist(str(tmp_path)) 
with pytest.raises(UnsupportedDialogueModelError) as execinfo: - PolicyEnsemble.load(str(tmp_path), new_config={"some": "new_config"}) - assert "NoFinetunePolicy does not support fine-tuning." in str(execinfo.value) + PolicyEnsemble.load(str(tmp_path), new_config={"policies": [{}]}) + assert "`PolicyWithoutLoadKwargs.load` does not accept **kwargs" in str( + execinfo.value + ) -def test_policy_loading_unsupported_finetune_epochs( - tmp_path: Path, monkeypatch: MonkeyPatch +def test_policy_loading_no_kwargs_with_no_context( + tmp_path: Path, capsys: CaptureFixture ): - original_policy_ensemble = PolicyEnsemble([NoFinetunePolicyEpochs()]) + original_policy_ensemble = PolicyEnsemble([PolicyWithoutLoadKwargs()]) original_policy_ensemble.train([], None, RegexInterpreter()) original_policy_ensemble.persist(str(tmp_path)) - monkeypatch.setattr(PolicyEnsemble, "_get_updated_epochs", Mock(return_value=10)) - - with pytest.raises(UnsupportedDialogueModelError) as execinfo: - PolicyEnsemble.load(str(tmp_path), new_config={"policies": [{}]}) - assert "NoFinetunePolicyEpochs does not support fine-tuning." in str(execinfo.value) - assert "if the policy uses epochs." in str(execinfo.value) + with pytest.warns(FutureWarning): + PolicyEnsemble.load(str(tmp_path)) class ConstantPolicy(Policy): @@ -128,9 +118,9 @@ def __init__( is_end_to_end_prediction: bool = False, events: Optional[List[Event]] = None, optional_events: Optional[List[Event]] = None, - should_finetune: bool = False, + **kwargs, ) -> None: - super().__init__(priority=priority, should_finetune=should_finetune) + super().__init__(priority=priority, **kwargs) self.predict_index = predict_index self.confidence = confidence self.is_end_to_end_prediction = is_end_to_end_prediction @@ -138,7 +128,7 @@ def __init__( self.optional_events = optional_events or [] @classmethod - def load(cls, _) -> Policy: + def load(cls, args, **kwargs) -> Policy: pass def persist(self, _) -> None: @@ -350,7 +340,7 @@ def test_fallback_wins_over_mapping(): class LoadReturnsNonePolicy(Policy): @classmethod - def load(cls, _) -> None: + def load(cls, *args, **kwargs) -> None: return None def persist(self, _) -> None: @@ -386,7 +376,7 @@ def test_policy_loading_load_returns_none(tmp_path: Path): class LoadReturnsWrongTypePolicy(Policy): @classmethod - def load(cls, _) -> Text: + def load(cls, *args, **kwargs) -> Text: return "" def persist(self, _) -> None: diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py index de07a275c91c..47c80d586730 100644 --- a/tests/core/test_policies.py +++ b/tests/core/test_policies.py @@ -1233,8 +1233,8 @@ def test_deprecation_warnings_for_old_rule_like_policies(policy: Type[Policy]): policy(None) -class NoFinetunePolicy(Policy): - def __init__(self): +class PolicyWithoutInitKwargs(Policy): + def __init__(self, *args): pass def persist(self, _) -> None: @@ -1245,10 +1245,19 @@ def _metadata_filename(cls): return "no_finetune_policy" -def test_loading_unsupported_policy_as_finetune(tmp_path: Path): +def test_loading_policy_with_no_constructor_kwargs(tmp_path: Path): rasa.shared.utils.io.write_text_file( - "{}", tmp_path / NoFinetunePolicy._metadata_filename() + "{}", tmp_path / PolicyWithoutInitKwargs._metadata_filename() ) with pytest.raises(UnsupportedDialogueModelError) as execinfo: - NoFinetunePolicy.load(str(tmp_path), should_finetune=True) - assert "NoFinetunePolicy does not support fine-tuning." 
in str(execinfo.value) + PolicyWithoutInitKwargs.load(str(tmp_path), should_finetune=True) + assert "PolicyWithoutInitKwargs `__init__` method does not accept **kwargs." in str( + execinfo.value + ) + + +def test_loading_policy_with_no_constructor_kwargs_but_required_args(tmp_path: Path): + rasa.shared.utils.io.write_text_file( + "{}", tmp_path / PolicyWithoutInitKwargs._metadata_filename() + ) + PolicyWithoutInitKwargs.load(str(tmp_path)) From 5dc82b752eb1d54bee501671e3a2379911ce5864 Mon Sep 17 00:00:00 2001 From: Joseph Juzl Date: Tue, 8 Dec 2020 14:32:44 +0100 Subject: [PATCH 22/45] fix --- rasa/core/policies/policy.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/rasa/core/policies/policy.py b/rasa/core/policies/policy.py index e73de2a981ab..7265b7f9cc59 100644 --- a/rasa/core/policies/policy.py +++ b/rasa/core/policies/policy.py @@ -296,7 +296,6 @@ def load(cls, path: Union[Text, Path], **kwargs) -> "Policy": featurizer = TrackerFeaturizer.load(path) data["featurizer"] = featurizer - data = {} if "should_finetune" in kwargs: data["should_finetune"] = kwargs["should_finetune"] @@ -413,7 +412,7 @@ def __init__( you return as they can potentially influence the conversation flow. is_end_to_end_prediction: `True` if the prediction used the text of the user message instead of the intent. - """ + """ self.probabilities = probabilities self.policy_name = policy_name self.policy_priority = (policy_priority,) From 311ea47e914e2865eb48d9458b22bce88dfa75c6 Mon Sep 17 00:00:00 2001 From: Joseph Juzl Date: Tue, 8 Dec 2020 14:39:51 +0100 Subject: [PATCH 23/45] fix train tests --- tests/test_train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_train.py b/tests/test_train.py index 62d3a269e3bb..cb6ac099ca2f 100644 --- a/tests/test_train.py +++ b/tests/test_train.py @@ -440,7 +440,7 @@ def test_model_finetuning_core( ted = model_to_finetune.policy_ensemble.policies[0] assert ted.config[EPOCHS] == 10 - assert ted.should_finetune is True + assert ted.config["should_finetune"] is True def test_model_finetuning_core_with_default_epochs( From 0bd119c0ab2b75ac0ff94bd09ebe8c7aecdb4b52 Mon Sep 17 00:00:00 2001 From: Joseph Juzl Date: Tue, 8 Dec 2020 14:55:43 +0100 Subject: [PATCH 24/45] More test fixes --- tests/core/test_policies.py | 3 ++- tests/test_train.py | 13 ++++--------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py index 47c80d586730..022bdfe40376 100644 --- a/tests/core/test_policies.py +++ b/tests/core/test_policies.py @@ -1260,4 +1260,5 @@ def test_loading_policy_with_no_constructor_kwargs_but_required_args(tmp_path: P rasa.shared.utils.io.write_text_file( "{}", tmp_path / PolicyWithoutInitKwargs._metadata_filename() ) - PolicyWithoutInitKwargs.load(str(tmp_path)) + with pytest.warns(FutureWarning): + PolicyWithoutInitKwargs.load(str(tmp_path)) diff --git a/tests/test_train.py b/tests/test_train.py index cb6ac099ca2f..448ad1b7f1f8 100644 --- a/tests/test_train.py +++ b/tests/test_train.py @@ -1,4 +1,5 @@ import sys +import tarfile import tempfile import os from pathlib import Path @@ -571,7 +572,6 @@ def test_model_finetuning_nlu_with_default_epochs( assert new_diet_metadata[EPOCHS] == DIETClassifier.defaults[EPOCHS] * 0.5 -@pytest.mark.parametrize("model_to_fine_tune", ["invalid-path-to-model", "."]) def test_model_finetuning_with_invalid_model( tmp_path: Path, monkeypatch: MonkeyPatch, @@ -579,7 +579,6 @@ def test_model_finetuning_with_invalid_model( 
default_stories_file: Text, default_stack_config: Text, default_nlu_data: Text, - model_to_fine_tune: Text, capsys: CaptureFixture, ): mocked_nlu_training = AsyncMock(return_value="") @@ -597,7 +596,7 @@ def test_model_finetuning_with_invalid_model( [default_stories_file, default_nlu_data], output=output, force_training=True, - model_to_finetune=model_to_fine_tune, + model_to_finetune="invalid-path-to-model", finetuning_epoch_fraction=1, ) @@ -614,14 +613,12 @@ def test_model_finetuning_with_invalid_model( assert "No NLU model for finetuning found" in output -@pytest.mark.parametrize("model_to_fine_tune", ["invalid-path-to-model", "."]) def test_model_finetuning_with_invalid_model_core( tmp_path: Path, monkeypatch: MonkeyPatch, default_domain_path: Text, default_stories_file: Text, default_stack_config: Text, - model_to_fine_tune: Text, capsys: CaptureFixture, ): mocked_core_training = AsyncMock() @@ -635,7 +632,7 @@ def test_model_finetuning_with_invalid_model_core( default_stack_config, default_stories_file, output=output, - model_to_finetune=model_to_fine_tune, + model_to_finetune="invalid-path-to-model", finetuning_epoch_fraction=1, ) @@ -647,14 +644,12 @@ def test_model_finetuning_with_invalid_model_core( assert "No Core model for finetuning found" in capsys.readouterr().out -@pytest.mark.parametrize("model_to_fine_tune", ["invalid-path-to-model", "."]) def test_model_finetuning_with_invalid_model_nlu( tmp_path: Path, monkeypatch: MonkeyPatch, default_domain_path: Text, default_stack_config: Text, default_nlu_data: Text, - model_to_fine_tune: Text, capsys: CaptureFixture, ): mocked_nlu_training = AsyncMock(return_value="") @@ -668,7 +663,7 @@ def test_model_finetuning_with_invalid_model_nlu( default_nlu_data, domain=default_domain_path, output=output, - model_to_finetune=model_to_fine_tune, + model_to_finetune="invalid-path-to-model", finetuning_epoch_fraction=1, ) From 3cb91b66c90cec9ea3e2df0c00b3ac195d78e524 Mon Sep 17 00:00:00 2001 From: Joe Juzl Date: Tue, 8 Dec 2020 16:41:25 +0100 Subject: [PATCH 25/45] Apply suggestions from code review Co-authored-by: Daksh Varshneya --- rasa/core/policies/ensemble.py | 2 +- rasa/core/policies/policy.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/rasa/core/policies/ensemble.py b/rasa/core/policies/ensemble.py index 40d9e2192b1e..b1c7e7320ac1 100644 --- a/rasa/core/policies/ensemble.py +++ b/rasa/core/policies/ensemble.py @@ -309,7 +309,7 @@ def _get_updated_epochs( policy_cls: Type[Policy], config_for_policy: Dict[Text, Any], finetuning_epoch_fraction: float, - ): + ) -> Optional[int]: if EPOCHS in config_for_policy: epochs = config_for_policy[EPOCHS] else: diff --git a/rasa/core/policies/policy.py b/rasa/core/policies/policy.py index 7265b7f9cc59..a91d47298ba5 100644 --- a/rasa/core/policies/policy.py +++ b/rasa/core/policies/policy.py @@ -412,7 +412,7 @@ def __init__( you return as they can potentially influence the conversation flow. is_end_to_end_prediction: `True` if the prediction used the text of the user message instead of the intent. 
- """ + """ self.probabilities = probabilities self.policy_name = policy_name self.policy_priority = (policy_priority,) From 73132f0bdee89f39e711be478bff7ce49310bf43 Mon Sep 17 00:00:00 2001 From: Joseph Juzl Date: Tue, 8 Dec 2020 16:47:57 +0100 Subject: [PATCH 26/45] remove unneeded sklearn epochs --- rasa/core/policies/sklearn_policy.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/rasa/core/policies/sklearn_policy.py b/rasa/core/policies/sklearn_policy.py index 750f0bc7e763..a87779d2afb6 100644 --- a/rasa/core/policies/sklearn_policy.py +++ b/rasa/core/policies/sklearn_policy.py @@ -324,8 +324,6 @@ def load(cls, path: Union[Text, Path], **kwargs,) -> Policy: zero_state_features = io_utils.pickle_load(zero_features_filename) data = {"should_finetune": kwargs.get("should_finetune", False)} - if "epoch_override" in kwargs: - data[EPOCHS] = kwargs["epoch_override"] policy = cls( featurizer=featurizer, From 28d5fe9f581b8a06fd2a57666cc297163794b632 Mon Sep 17 00:00:00 2001 From: Joe Juzl Date: Tue, 8 Dec 2020 17:09:02 +0100 Subject: [PATCH 27/45] Apply suggestions from code review Co-authored-by: Tobias Wochinger --- rasa/core/policies/ensemble.py | 2 +- rasa/core/policies/sklearn_policy.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/rasa/core/policies/ensemble.py b/rasa/core/policies/ensemble.py index b1c7e7320ac1..77c15e2355a3 100644 --- a/rasa/core/policies/ensemble.py +++ b/rasa/core/policies/ensemble.py @@ -326,7 +326,7 @@ def load( new_config: Optional[Dict] = None, finetuning_epoch_fraction: float = 1.0, ) -> "PolicyEnsemble": - """Loads policy and domain specification from storage.""" + """Loads policy and domain specification from disk.""" metadata = cls.load_metadata(path) cls.ensure_model_compatibility(metadata) policies = [] diff --git a/rasa/core/policies/sklearn_policy.py b/rasa/core/policies/sklearn_policy.py index a87779d2afb6..98eb7ad6d7ea 100644 --- a/rasa/core/policies/sklearn_policy.py +++ b/rasa/core/policies/sklearn_policy.py @@ -304,7 +304,7 @@ def persist(self, path: Union[Text, Path]) -> None: @classmethod def load(cls, path: Union[Text, Path], **kwargs,) -> Policy: - """See the docstring for `Policy.Load`.""" + """See the docstring for `Policy.load`.""" filename = Path(path) / "sklearn_model.pkl" zero_features_filename = Path(path) / "zero_state_features.pkl" if not Path(path).exists(): From 0e6ddba611d441af73e1015921682ea8627f9eee Mon Sep 17 00:00:00 2001 From: Joseph Juzl Date: Tue, 8 Dec 2020 17:11:06 +0100 Subject: [PATCH 28/45] PR comments for warning strings --- rasa/core/policies/ensemble.py | 12 ++++++------ rasa/core/policies/policy.py | 4 ++-- tests/core/test_ensemble.py | 2 +- tests/core/test_policies.py | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/rasa/core/policies/ensemble.py b/rasa/core/policies/ensemble.py index 77c15e2355a3..dfbce47c3b9e 100644 --- a/rasa/core/policies/ensemble.py +++ b/rasa/core/policies/ensemble.py @@ -349,16 +349,16 @@ def load( if "kwargs" not in rasa.shared.utils.common.arguments_of(policy_cls.load): if context: raise UnsupportedDialogueModelError( - f"`{policy_cls.__name__}.load` does not accept **kwargs. " - f"Attempting to pass {context} to the policy. " - f"This argument should be added to all policies by " + f"`{policy_cls.__name__}.{policy_cls.load.__name__}` does not " + f"accept `**kwargs`. Attempting to pass {context} to the " + f"policy. `**kwargs` should be added to all policies by " f"Rasa Open Source 3.0.0." 
) else: rasa.shared.utils.io.raise_deprecation_warning( - f"{policy_cls.__name__} `load` method does not " - f"accept **kwargs. This is required for contextual" - f" information e.g. the flag `should_finetune`.", + f"`{policy_cls.__name__}.{policy_cls.load.__name__}` does not " + f"accept `**kwargs`. `**kwargs` are required for contextual " + f"information e.g. the flag `should_finetune`.", warn_until_version="3.0.0", ) diff --git a/rasa/core/policies/policy.py b/rasa/core/policies/policy.py index a91d47298ba5..2ba69143f61c 100644 --- a/rasa/core/policies/policy.py +++ b/rasa/core/policies/policy.py @@ -303,14 +303,14 @@ def load(cls, path: Union[Text, Path], **kwargs) -> "Policy": if "kwargs" not in constructor_args: if set(data.keys()).issubset(set(constructor_args)): rasa.shared.utils.io.raise_deprecation_warning( - f"{cls.__name__} `__init__` method does not accept **kwargs " + f"`{cls.__name__}.__init__` does not accept `**kwargs` " f"This is required for contextual information e.g. the flag " f"`should_finetune`.", warn_until_version="3.0.0", ) else: raise UnsupportedDialogueModelError( - f"{cls.__name__} `__init__` method does not accept **kwargs. " + f"`{cls.__name__}.__init__` does not accept `**kwargs`. " f"Attempting to pass {data} to the policy. " f"This argument should be added to all policies by " f"Rasa Open Source 3.0.0." diff --git a/tests/core/test_ensemble.py b/tests/core/test_ensemble.py index 02d2825779d2..85d1fb580fe0 100644 --- a/tests/core/test_ensemble.py +++ b/tests/core/test_ensemble.py @@ -94,7 +94,7 @@ def test_policy_loading_no_kwargs_with_context(tmp_path: Path): with pytest.raises(UnsupportedDialogueModelError) as execinfo: PolicyEnsemble.load(str(tmp_path), new_config={"policies": [{}]}) - assert "`PolicyWithoutLoadKwargs.load` does not accept **kwargs" in str( + assert "`PolicyWithoutLoadKwargs.load` does not accept `**kwargs`" in str( execinfo.value ) diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py index 022bdfe40376..2994b17a2241 100644 --- a/tests/core/test_policies.py +++ b/tests/core/test_policies.py @@ -1251,7 +1251,7 @@ def test_loading_policy_with_no_constructor_kwargs(tmp_path: Path): ) with pytest.raises(UnsupportedDialogueModelError) as execinfo: PolicyWithoutInitKwargs.load(str(tmp_path), should_finetune=True) - assert "PolicyWithoutInitKwargs `__init__` method does not accept **kwargs." in str( + assert "`PolicyWithoutInitKwargs.__init__` does not accept `**kwargs`." 
in str( execinfo.value ) From 14a6a319d67f7a357f3f491628f22eb0c4bdf1a4 Mon Sep 17 00:00:00 2001 From: Joseph Juzl Date: Tue, 8 Dec 2020 17:20:42 +0100 Subject: [PATCH 29/45] Add typing --- rasa/core/policies/fallback.py | 2 +- rasa/core/policies/form_policy.py | 2 +- rasa/core/policies/mapping_policy.py | 2 +- rasa/core/policies/memoization.py | 2 +- rasa/core/policies/policy.py | 4 ++-- rasa/core/policies/rule_policy.py | 2 +- rasa/core/policies/sklearn_policy.py | 2 +- rasa/core/policies/ted_policy.py | 2 +- rasa/core/policies/two_stage_fallback.py | 2 +- tests/core/test_ensemble.py | 4 ++-- tests/core/test_policies.py | 4 ++-- 11 files changed, 14 insertions(+), 14 deletions(-) diff --git a/rasa/core/policies/fallback.py b/rasa/core/policies/fallback.py index 13331f45840c..9ddb96b4ff84 100644 --- a/rasa/core/policies/fallback.py +++ b/rasa/core/policies/fallback.py @@ -39,7 +39,7 @@ def __init__( ambiguity_threshold: float = DEFAULT_NLU_FALLBACK_AMBIGUITY_THRESHOLD, core_threshold: float = DEFAULT_CORE_FALLBACK_THRESHOLD, fallback_action_name: Text = ACTION_DEFAULT_FALLBACK_NAME, - **kwargs, + **kwargs: Any, ) -> None: """Create a new Fallback policy. diff --git a/rasa/core/policies/form_policy.py b/rasa/core/policies/form_policy.py index e2c7f3d6b640..5fcb2f7a2db1 100644 --- a/rasa/core/policies/form_policy.py +++ b/rasa/core/policies/form_policy.py @@ -35,7 +35,7 @@ def __init__( featurizer: Optional[TrackerFeaturizer] = None, priority: int = FORM_POLICY_PRIORITY, lookup: Optional[Dict] = None, - **kwargs, + **kwargs: Any, ) -> None: # max history is set to 2 in order to capture diff --git a/rasa/core/policies/mapping_policy.py b/rasa/core/policies/mapping_policy.py index 594895713524..fac4492a0df7 100644 --- a/rasa/core/policies/mapping_policy.py +++ b/rasa/core/policies/mapping_policy.py @@ -43,7 +43,7 @@ class MappingPolicy(Policy): def _standard_featurizer() -> None: return None - def __init__(self, priority: int = MAPPING_POLICY_PRIORITY, **kwargs,) -> None: + def __init__(self, priority: int = MAPPING_POLICY_PRIORITY, **kwargs: Any) -> None: """Create a new Mapping policy.""" super().__init__(priority=priority, **kwargs) diff --git a/rasa/core/policies/memoization.py b/rasa/core/policies/memoization.py index eded89d74ede..849678e3f4a1 100644 --- a/rasa/core/policies/memoization.py +++ b/rasa/core/policies/memoization.py @@ -71,7 +71,7 @@ def __init__( priority: int = MEMOIZATION_POLICY_PRIORITY, max_history: Optional[int] = MAX_HISTORY_NOT_SET, lookup: Optional[Dict] = None, - **kwargs, + **kwargs: Any, ) -> None: """Initialize the policy. diff --git a/rasa/core/policies/policy.py b/rasa/core/policies/policy.py index 2ba69143f61c..52b75d7f0e22 100644 --- a/rasa/core/policies/policy.py +++ b/rasa/core/policies/policy.py @@ -113,7 +113,7 @@ def __init__( self, featurizer: Optional[TrackerFeaturizer] = None, priority: int = DEFAULT_POLICY_PRIORITY, - **kwargs, + **kwargs: Any, ) -> None: """Constructs a new Policy object.""" self.__featurizer = self._create_featurizer(featurizer) @@ -278,7 +278,7 @@ def persist(self, path: Union[Text, Path]) -> None: rasa.shared.utils.io.dump_obj_as_json_to_file(file, self._metadata()) @classmethod - def load(cls, path: Union[Text, Path], **kwargs) -> "Policy": + def load(cls, path: Union[Text, Path], **kwargs: Any) -> "Policy": """Loads a policy from path. 
Args: diff --git a/rasa/core/policies/rule_policy.py b/rasa/core/policies/rule_policy.py index 6616ea0dcd4a..033ace294237 100644 --- a/rasa/core/policies/rule_policy.py +++ b/rasa/core/policies/rule_policy.py @@ -108,7 +108,7 @@ def __init__( enable_fallback_prediction: bool = True, restrict_rules: bool = True, check_for_contradictions: bool = True, - **kwargs, + **kwargs: Any, ) -> None: """Create a `RulePolicy` object. diff --git a/rasa/core/policies/sklearn_policy.py b/rasa/core/policies/sklearn_policy.py index 98eb7ad6d7ea..0f5fc0ad5a50 100644 --- a/rasa/core/policies/sklearn_policy.py +++ b/rasa/core/policies/sklearn_policy.py @@ -303,7 +303,7 @@ def persist(self, path: Union[Text, Path]) -> None: ) @classmethod - def load(cls, path: Union[Text, Path], **kwargs,) -> Policy: + def load(cls, path: Union[Text, Path], **kwargs: Any) -> Policy: """See the docstring for `Policy.load`.""" filename = Path(path) / "sklearn_model.pkl" zero_features_filename = Path(path) / "zero_state_features.pkl" diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py index 2442c257ef84..16bc37cd32d6 100644 --- a/rasa/core/policies/ted_policy.py +++ b/rasa/core/policies/ted_policy.py @@ -436,7 +436,7 @@ def persist(self, path: Union[Text, Path]) -> None: ) @classmethod - def load(cls, path: Union[Text, Path], **kwargs,) -> "TEDPolicy": + def load(cls, path: Union[Text, Path], **kwargs: Any) -> "TEDPolicy": """Loads a policy from the storage. **Needs to load its featurizer** diff --git a/rasa/core/policies/two_stage_fallback.py b/rasa/core/policies/two_stage_fallback.py index 04335d207ca4..ed3db549dc6f 100644 --- a/rasa/core/policies/two_stage_fallback.py +++ b/rasa/core/policies/two_stage_fallback.py @@ -62,7 +62,7 @@ def __init__( fallback_core_action_name: Text = ACTION_DEFAULT_FALLBACK_NAME, fallback_nlu_action_name: Text = ACTION_DEFAULT_FALLBACK_NAME, deny_suggestion_intent_name: Text = USER_INTENT_OUT_OF_SCOPE, - **kwargs, + **kwargs: Any, ) -> None: """Create a new Two-stage Fallback policy. 
diff --git a/tests/core/test_ensemble.py b/tests/core/test_ensemble.py index 85d1fb580fe0..050ad361c45f 100644 --- a/tests/core/test_ensemble.py +++ b/tests/core/test_ensemble.py @@ -41,7 +41,7 @@ class WorkingPolicy(Policy): @classmethod - def load(cls, *args, **kwargs) -> Policy: + def load(cls, *args: Any, **kwargs: Any) -> Policy: return WorkingPolicy() def persist(self, _) -> None: @@ -118,7 +118,7 @@ def __init__( is_end_to_end_prediction: bool = False, events: Optional[List[Event]] = None, optional_events: Optional[List[Event]] = None, - **kwargs, + **kwargs: Any, ) -> None: super().__init__(priority=priority, **kwargs) self.predict_index = predict_index diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py index 2994b17a2241..fe00581ab250 100644 --- a/tests/core/test_policies.py +++ b/tests/core/test_policies.py @@ -1234,14 +1234,14 @@ def test_deprecation_warnings_for_old_rule_like_policies(policy: Type[Policy]): class PolicyWithoutInitKwargs(Policy): - def __init__(self, *args): + def __init__(self, *args: Any) -> None: pass def persist(self, _) -> None: pass @classmethod - def _metadata_filename(cls): + def _metadata_filename(cls) -> Text: return "no_finetune_policy" From 61aefc38ff145e43cfc985c006662871dd26af18 Mon Sep 17 00:00:00 2001 From: Joseph Juzl Date: Tue, 8 Dec 2020 17:38:01 +0100 Subject: [PATCH 30/45] add back invalid model tests --- tests/test_train.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/tests/test_train.py b/tests/test_train.py index 448ad1b7f1f8..cb6ac099ca2f 100644 --- a/tests/test_train.py +++ b/tests/test_train.py @@ -1,5 +1,4 @@ import sys -import tarfile import tempfile import os from pathlib import Path @@ -572,6 +571,7 @@ def test_model_finetuning_nlu_with_default_epochs( assert new_diet_metadata[EPOCHS] == DIETClassifier.defaults[EPOCHS] * 0.5 +@pytest.mark.parametrize("model_to_fine_tune", ["invalid-path-to-model", "."]) def test_model_finetuning_with_invalid_model( tmp_path: Path, monkeypatch: MonkeyPatch, @@ -579,6 +579,7 @@ def test_model_finetuning_with_invalid_model( default_stories_file: Text, default_stack_config: Text, default_nlu_data: Text, + model_to_fine_tune: Text, capsys: CaptureFixture, ): mocked_nlu_training = AsyncMock(return_value="") @@ -596,7 +597,7 @@ def test_model_finetuning_with_invalid_model( [default_stories_file, default_nlu_data], output=output, force_training=True, - model_to_finetune="invalid-path-to-model", + model_to_finetune=model_to_fine_tune, finetuning_epoch_fraction=1, ) @@ -613,12 +614,14 @@ def test_model_finetuning_with_invalid_model( assert "No NLU model for finetuning found" in output +@pytest.mark.parametrize("model_to_fine_tune", ["invalid-path-to-model", "."]) def test_model_finetuning_with_invalid_model_core( tmp_path: Path, monkeypatch: MonkeyPatch, default_domain_path: Text, default_stories_file: Text, default_stack_config: Text, + model_to_fine_tune: Text, capsys: CaptureFixture, ): mocked_core_training = AsyncMock() @@ -632,7 +635,7 @@ def test_model_finetuning_with_invalid_model_core( default_stack_config, default_stories_file, output=output, - model_to_finetune="invalid-path-to-model", + model_to_finetune=model_to_fine_tune, finetuning_epoch_fraction=1, ) @@ -644,12 +647,14 @@ def test_model_finetuning_with_invalid_model_core( assert "No Core model for finetuning found" in capsys.readouterr().out +@pytest.mark.parametrize("model_to_fine_tune", ["invalid-path-to-model", "."]) def test_model_finetuning_with_invalid_model_nlu( tmp_path: Path, 
monkeypatch: MonkeyPatch, default_domain_path: Text, default_stack_config: Text, default_nlu_data: Text, + model_to_fine_tune: Text, capsys: CaptureFixture, ): mocked_nlu_training = AsyncMock(return_value="") @@ -663,7 +668,7 @@ def test_model_finetuning_with_invalid_model_nlu( default_nlu_data, domain=default_domain_path, output=output, - model_to_finetune="invalid-path-to-model", + model_to_finetune=model_to_fine_tune, finetuning_epoch_fraction=1, ) From d1e029571174c73cd203216293aa8314b4d09145 Mon Sep 17 00:00:00 2001 From: Joseph Juzl Date: Tue, 8 Dec 2020 18:05:06 +0100 Subject: [PATCH 31/45] handle empty sections in config --- rasa/model.py | 2 +- tests/test_model.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/rasa/model.py b/rasa/model.py index 77f86c4eb616..307eac5b0b82 100644 --- a/rasa/model.py +++ b/rasa/model.py @@ -376,7 +376,7 @@ def _get_fingerprint_of_config_without_epochs( copied_config = copy.deepcopy(config) for key in ["pipeline", "policies"]: - if key in copied_config: + if key in copied_config and copied_config[key]: for p in copied_config[key]: if "epochs" in p: del p["epochs"] diff --git a/tests/test_model.py b/tests/test_model.py index 2c6350dd1a63..130b19c7e921 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -372,6 +372,21 @@ async def test_fingerprinting_changing_config_epochs(project: Text, tmp_path): ) +@pytest.mark.parametrize("empty_key", ["pipeline", "policies"]) +async def test_fingerprinting_config_epochs_empty_pipeline_or_policies( + project: Text, tmp_path: Path, empty_key: Text, +): + config = { + "language": "en", + "pipeline": [{"name": "WhitespaceTokenizer"},], + "policies": [{"name": "MemoizationPolicy"},], + } + + config[empty_key] = None + + model._get_fingerprint_of_config_without_epochs(config) + + async def test_fingerprinting_additional_action(project: Text): importer = _project_files(project) From 9ebe706575023816c7594d1b4f8101c7ff8985aa Mon Sep 17 00:00:00 2001 From: Daksh Date: Tue, 8 Dec 2020 18:47:03 +0100 Subject: [PATCH 32/45] review comments --- rasa/nlu/classifiers/diet_classifier.py | 2 +- .../count_vectors_featurizer.py | 5 ---- .../sparse_featurizer/regex_featurizer.py | 30 +++++++++---------- .../test_count_vectors_featurizer.py | 1 - .../nlu/featurizers/test_regex_featurizer.py | 6 ++-- 5 files changed, 18 insertions(+), 26 deletions(-) diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index 9b5f9a115605..9ffa2b477d0d 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -336,7 +336,7 @@ def __init__( self.finetune_mode = finetune_mode if not self.model and self.finetune_mode: - raise rasa.utils.exceptions.RasaException( + raise rasa.shared.exceptions.InvalidParameterException( f"{self.__class__.__name__} was instantiated " f"with `model=None` and `finetune_mode=True`. " f"This is not a valid combination as the component " diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index e6a40dcf37be..bb3cf64c1acf 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -380,7 +380,6 @@ def _update_vectorizer_vocabulary( Args: attribute: Message attribute for which vocabulary should be updated. new_vocabulary: Set of words to expand the vocabulary with if they are unseen. 
- """ existing_vocabulary: Dict[Text, int] = self.vectorizers[attribute].vocabulary if len(new_vocabulary) > len(existing_vocabulary): @@ -451,7 +450,6 @@ def _add_buffer_to_vocabulary(self, attribute: Text) -> None: Args: attribute: Name of the attribute for which the vocabulary should be expanded. - """ original_vocabulary = self.vectorizers[attribute].vocabulary_ current_vocabulary_size = len(original_vocabulary) @@ -471,7 +469,6 @@ def _set_vocabulary( Args: attribute: Message attribute for which vocabulary should be set original_vocabulary: Vocabulary for the attribute to be set. - """ self.vectorizers[attribute].vocabulary_ = original_vocabulary self.vectorizers[attribute]._validate_vocabulary() @@ -567,7 +564,6 @@ def _fit_loaded_vectorizer( Args: attribute: Message attribute for which the vectorizer is to be trained. attribute_texts: Training texts for the attribute - """ # Get vocabulary words by the preprocessor new_vocabulary = self._construct_vocabulary_from_texts( @@ -584,7 +580,6 @@ def _fit_vectorizer_from_scratch( Args: attribute: Message attribute for which the vectorizer is to be trained. attribute_texts: Training texts for the attribute - """ try: self.vectorizers[attribute].fit(attribute_texts) diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index b8654815ef8c..4d681f5a13fe 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -1,8 +1,7 @@ import logging -import os import re from typing import Any, Dict, List, Optional, Text, Type, Tuple - +from pathlib import Path import numpy as np import scipy.sparse @@ -30,7 +29,6 @@ from rasa.nlu.tokenizers.tokenizer import Tokenizer from rasa.shared.nlu.training_data.training_data import TrainingData from rasa.shared.nlu.training_data.message import Message -from rasa.shared.exceptions import RasaException from rasa.shared.utils.common import lazy_property logger = logging.getLogger(__name__) @@ -83,7 +81,7 @@ def __init__( if self.finetune_mode and not self.pattern_vocabulary_stats: # If the featurizer is instantiated in finetune mode, # the vocabulary stats for it should be known. - raise RasaException( + raise rasa.shared.exceptions.InvalidParameterException( f"{self.__class__.__name__} was instantiated with" f" `finetune_mode=True` but `pattern_vocabulary_stats`" f" was left to `None`. 
This is invalid since the featurizer" @@ -308,19 +306,21 @@ def load( """ file_name = meta.get("file") - patterns_file_name = os.path.join(model_dir, file_name + ".patterns.pkl") + patterns_file_name = Path(model_dir).joinpath(file_name + ".patterns.pkl") - vocabulary_stats_file_name = os.path.join( - model_dir, file_name + ".vocabulary_stats.pkl" + vocabulary_stats_file_name = Path(model_dir).joinpath( + file_name + ".vocabulary_stats.pkl" ) known_patterns = None vocabulary_stats = None - if os.path.exists(patterns_file_name): - known_patterns = rasa.shared.utils.io.read_json_file(patterns_file_name) - if os.path.exists(vocabulary_stats_file_name): + if patterns_file_name.exists(): + known_patterns = rasa.shared.utils.io.read_json_file( + str(patterns_file_name) + ) + if vocabulary_stats_file_name.exists(): vocabulary_stats = rasa.shared.utils.io.read_json_file( - vocabulary_stats_file_name + str(vocabulary_stats_file_name) ) return RegexFeaturizer( @@ -341,10 +341,10 @@ def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]] Metadata necessary to load the model again. """ patterns_file_name = file_name + ".patterns.pkl" - regex_file = os.path.join(model_dir, patterns_file_name) - utils.write_json_to_file(regex_file, self.known_patterns, indent=4) + regex_file = Path(model_dir).joinpath(patterns_file_name) + utils.write_json_to_file(str(regex_file), self.known_patterns, indent=4) vocabulary_stats_file_name = file_name + ".vocabulary_stats.pkl" - vocabulary_file = os.path.join(model_dir, vocabulary_stats_file_name) - utils.write_json_to_file(vocabulary_file, self.vocabulary_stats, indent=4) + vocabulary_file = Path(model_dir).joinpath(vocabulary_stats_file_name) + utils.write_json_to_file(str(vocabulary_file), self.vocabulary_stats, indent=4) return {"file": file_name} diff --git a/tests/nlu/featurizers/test_count_vectors_featurizer.py b/tests/nlu/featurizers/test_count_vectors_featurizer.py index a46acdcb61c0..79c16aebcb6f 100644 --- a/tests/nlu/featurizers/test_count_vectors_featurizer.py +++ b/tests/nlu/featurizers/test_count_vectors_featurizer.py @@ -877,7 +877,6 @@ def test_cvf_incremental_train_vocabulary_overflow( tokenizer.train(data) caplog.set_level(logging.WARNING) - caplog.clear() with caplog.at_level(logging.WARNING): new_featurizer.train(data) if overflow: diff --git a/tests/nlu/featurizers/test_regex_featurizer.py b/tests/nlu/featurizers/test_regex_featurizer.py index 5e2a4c31a3ca..c08274f798ef 100644 --- a/tests/nlu/featurizers/test_regex_featurizer.py +++ b/tests/nlu/featurizers/test_regex_featurizer.py @@ -4,7 +4,6 @@ import pytest from pathlib import Path from _pytest.logging import LogCaptureFixture -import os import logging from rasa.shared.nlu.training_data.training_data import TrainingData @@ -402,8 +401,8 @@ def test_persist_load(tmp_path: Path): # Test all artifacts stored as part of persist assert persist_value["file"] == "ftr" - assert os.path.exists(os.path.join(str(tmp_path), "ftr.patterns.pkl")) - assert os.path.exists(os.path.join(str(tmp_path), "ftr.vocabulary_stats.pkl")) + assert tmp_path.joinpath("ftr.patterns.pkl").exists() + assert tmp_path.joinpath("ftr.vocabulary_stats.pkl").exists() assert featurizer.vocabulary_stats == { "max_number_patterns": 8, "pattern_slots_filled": 3, @@ -541,7 +540,6 @@ def test_vocabulary_overflow_log(caplog: LogCaptureFixture): ] caplog.set_level(logging.WARNING) - caplog.clear() with caplog.at_level(logging.WARNING): featurizer.train(TrainingData([], regex_features=additional_patterns)) assert ( 
From 7c10b09903d4ba620f29bbd58a73c88ec2c3e60e Mon Sep 17 00:00:00 2001 From: Daksh Date: Tue, 8 Dec 2020 23:24:01 +0100 Subject: [PATCH 33/45] make core models finetunable --- rasa/core/policies/policy.py | 4 +-- rasa/core/policies/ted_policy.py | 54 ++++++++++++++++++-------------- 2 files changed, 32 insertions(+), 26 deletions(-) diff --git a/rasa/core/policies/policy.py b/rasa/core/policies/policy.py index 52b75d7f0e22..69abe903b850 100644 --- a/rasa/core/policies/policy.py +++ b/rasa/core/policies/policy.py @@ -118,6 +118,7 @@ def __init__( """Constructs a new Policy object.""" self.__featurizer = self._create_featurizer(featurizer) self.priority = priority + self.finetune_mode = kwargs.pop("should_finetune", False) @property def featurizer(self): @@ -296,8 +297,7 @@ def load(cls, path: Union[Text, Path], **kwargs: Any) -> "Policy": featurizer = TrackerFeaturizer.load(path) data["featurizer"] = featurizer - if "should_finetune" in kwargs: - data["should_finetune"] = kwargs["should_finetune"] + data["should_finetune"] = kwargs.pop("should_finetune", False) constructor_args = rasa.shared.utils.common.arguments_of(cls) if "kwargs" not in constructor_args: diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py index 16bc37cd32d6..d47a676d9d54 100644 --- a/rasa/core/policies/ted_policy.py +++ b/rasa/core/policies/ted_policy.py @@ -349,12 +349,16 @@ def train( # keep one example for persisting and loading self.data_example = model_data.first_data_example() - self.model = TED( - model_data.get_signature(), - self.config, - isinstance(self.featurizer, MaxHistoryTrackerFeaturizer), - self._label_data, - ) + if not self.finetune_mode: + # This means the model wasn't loaded from a + # previously trained model and hence needs + # to be instantiated. 
+ self.model = TED( + model_data.get_signature(), + self.config, + isinstance(self.featurizer, MaxHistoryTrackerFeaturizer), + self._label_data, + ) self.model.fit( model_data, @@ -476,6 +480,10 @@ def load(cls, path: Union[Text, Path], **kwargs: Any) -> "TEDPolicy": ) meta = train_utils.update_similarity_type(meta) + meta["should_finetune"] = kwargs.get("should_finetune", False) + if "epoch_override" in kwargs: + meta[EPOCHS] = kwargs["epoch_override"] + model = TED.load( str(tf_model_file), model_data_example, @@ -485,24 +493,22 @@ def load(cls, path: Union[Text, Path], **kwargs: Any) -> "TEDPolicy": featurizer, MaxHistoryTrackerFeaturizer ), label_data=label_data, - ) - - # build the graph for prediction - predict_data_example = RasaModelData( - label_key=LABEL_KEY, - label_sub_key=LABEL_SUB_KEY, - data={ - feature_name: features - for feature_name, features in model_data_example.items() - if feature_name - in STATE_LEVEL_FEATURES + FEATURES_TO_ENCODE + [DIALOGUE] - }, - ) - model.build_for_predict(predict_data_example) - - meta["should_finetune"] = kwargs.get("should_finetune", False) - if "epoch_override" in kwargs: - meta[EPOCHS] = kwargs["epoch_override"] + finetune_mode=meta["should_finetune"], + ) + + if not meta["should_finetune"]: + # build the graph for prediction + predict_data_example = RasaModelData( + label_key=LABEL_KEY, + label_sub_key=LABEL_SUB_KEY, + data={ + feature_name: features + for feature_name, features in model_data_example.items() + if feature_name + in STATE_LEVEL_FEATURES + FEATURES_TO_ENCODE + [DIALOGUE] + }, + ) + model.build_for_predict(predict_data_example) return cls( featurizer=featurizer, From bd7927fcedfb628c7ed1f905eaa56c5bfbfe94b2 Mon Sep 17 00:00:00 2001 From: Daksh Date: Wed, 9 Dec 2020 00:46:38 +0100 Subject: [PATCH 34/45] add tests finetuning core policies --- rasa/core/policies/sklearn_policy.py | 2 +- tests/core/policies/test_rule_policy.py | 42 ++++++++++++++ tests/core/test_policies.py | 76 ++++++++++++++++++++++++- 3 files changed, 116 insertions(+), 4 deletions(-) diff --git a/rasa/core/policies/sklearn_policy.py b/rasa/core/policies/sklearn_policy.py index 0f5fc0ad5a50..d9ad04c40403 100644 --- a/rasa/core/policies/sklearn_policy.py +++ b/rasa/core/policies/sklearn_policy.py @@ -303,7 +303,7 @@ def persist(self, path: Union[Text, Path]) -> None: ) @classmethod - def load(cls, path: Union[Text, Path], **kwargs: Any) -> Policy: + def load(cls, path: Union[Text, Path], **kwargs: Any) -> "SklearnPolicy": """See the docstring for `Policy.load`.""" filename = Path(path) / "sklearn_model.pkl" zero_features_filename = Path(path) / "zero_state_features.pkl" diff --git a/tests/core/policies/test_rule_policy.py b/tests/core/policies/test_rule_policy.py index b8b86a4d67df..836df77ee97f 100644 --- a/tests/core/policies/test_rule_policy.py +++ b/tests/core/policies/test_rule_policy.py @@ -656,6 +656,48 @@ def test_all_policy_attributes_are_persisted(tmpdir: Path): assert persisted_policy._enable_fallback_prediction == enable_fallback_prediction +async def test_rule_policy_finetune( + tmpdir: Path, trained_rule_policy: RulePolicy, trained_rule_policy_domain: Domain +): + + trained_rule_policy.persist(tmpdir) + + loaded_policy = RulePolicy.load(tmpdir, should_finetune=True) + + assert loaded_policy.finetune_mode + + new_rule = TrackerWithCachedStates.from_events( + "stop story", + domain=trained_rule_policy_domain, + slots=trained_rule_policy_domain.slots, + evts=[ + ActionExecuted(RULE_SNIPPET_ACTION_NAME), + ActionExecuted(ACTION_LISTEN_NAME), + 
UserUttered(intent={"name": "stopp"}), + ActionExecuted("utter_stop"), + ActionExecuted(ACTION_LISTEN_NAME), + ], + ) + new_rule.is_rule_tracker = True + + original_data = await training.load_data( + "examples/rules/data/rules.yml", trained_rule_policy_domain + ) + + loaded_policy.train( + original_data + [new_rule], trained_rule_policy_domain, RegexInterpreter() + ) + + assert ( + len(loaded_policy.lookup["rules"]) + == len(trained_rule_policy.lookup["rules"]) + 1 + ) + assert ( + """[{"prev_action": {"action_name": "action_listen"}, "user": {"intent": "stopp"}}]""" + in loaded_policy.lookup["rules"] + ) + + def test_faq_rule(): domain = Domain.from_yaml( f""" diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py index fe00581ab250..3aa109bd896d 100644 --- a/tests/core/test_policies.py +++ b/tests/core/test_policies.py @@ -32,7 +32,12 @@ USER, ) from rasa.shared.core.domain import State, Domain -from rasa.shared.core.events import ActionExecuted, ConversationPaused, Event +from rasa.shared.core.events import ( + ActionExecuted, + ConversationPaused, + Event, + UserUttered, +) from rasa.core.featurizers.single_state_featurizer import SingleStateFeaturizer from rasa.core.featurizers.tracker_featurizers import ( MaxHistoryTrackerFeaturizer, @@ -138,11 +143,20 @@ def test_featurizer(self, trained_policy: Policy, tmp_path: Path): assert loaded.featurizer.max_history == self.max_history assert isinstance(loaded.featurizer.state_featurizer, SingleStateFeaturizer) + @pytest.mark.parametrize("should_finetune", [False, True]) async def test_persist_and_load( - self, trained_policy: Policy, default_domain: Domain, tmp_path: Path + self, + trained_policy: Policy, + default_domain: Domain, + tmp_path: Path, + should_finetune: bool, ): trained_policy.persist(str(tmp_path)) - loaded = trained_policy.__class__.load(str(tmp_path)) + loaded = trained_policy.__class__.load( + str(tmp_path), should_finetune=should_finetune + ) + assert loaded.finetune_mode == should_finetune + trackers = await train_trackers(default_domain, augmentation_factor=20) for tracker in trackers: @@ -337,6 +351,24 @@ def test_train_with_shuffle_false( # does not raise policy.train(trackers, domain=default_domain, interpreter=RegexInterpreter()) + def test_finetune_after_load( + self, + trained_policy: SklearnPolicy, + trackers: List[TrackerWithCachedStates], + default_domain: Domain, + tmp_path: Path, + ): + + trained_policy.persist(tmp_path) + + loaded_policy = SklearnPolicy.load(tmp_path, should_finetune=True) + + assert loaded_policy.finetune_mode + + loaded_policy.train(trackers, default_domain, RegexInterpreter()) + + assert loaded_policy.model + class TestTEDPolicy(PolicyTestCollection): def test_train_model_checkpointing(self, tmp_path: Path): @@ -722,6 +754,44 @@ def test_memorise_with_nlu( recalled = trained_policy.recall(states, tracker, default_domain) assert recalled is not None + async def test_finetune_after_load( + self, trained_policy: MemoizationPolicy, default_domain: Domain, tmp_path: Path + ): + + trained_policy.persist(tmp_path) + + loaded_policy = MemoizationPolicy.load(tmp_path, should_finetune=True) + + assert loaded_policy.finetune_mode + + new_story = TrackerWithCachedStates.from_events( + "channel", + domain=default_domain, + slots=default_domain.slots, + evts=[ + ActionExecuted(ACTION_LISTEN_NAME), + UserUttered(intent={"name": "why"}), + ActionExecuted("utter_channel"), + ActionExecuted(ACTION_LISTEN_NAME), + ], + ) + original_train_data = await train_trackers( + default_domain, 
augmentation_factor=20 + ) + loaded_policy.train( + original_train_data + [new_story], default_domain, RegexInterpreter() + ) + + # Get the hash of the tracker state of new story + new_story_states, _ = loaded_policy.featurizer.training_states_and_actions( + [new_story], default_domain + ) + + # Feature keys for each new state should be present in the lookup + for states in new_story_states: + state_key = loaded_policy._create_feature_key(states) + assert state_key in loaded_policy.lookup + class TestAugmentedMemoizationPolicy(TestMemoizationPolicy): def create_policy( From 64b681ef9e700a30f4bf6ad0dd195ae71bfc1feb Mon Sep 17 00:00:00 2001 From: Daksh Date: Wed, 9 Dec 2020 11:25:57 +0100 Subject: [PATCH 35/45] add print for loaded model --- rasa/train.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/rasa/train.py b/rasa/train.py index 6bd70e8210cd..1dab51e8cd51 100644 --- a/rasa/train.py +++ b/rasa/train.py @@ -500,6 +500,11 @@ def _core_model_for_finetuning( if not path_to_archive: return None + rasa.shared.utils.cli.print_color( + f"Loading model from {path_to_archive} for finetuning...", + color=rasa.shared.utils.io.bcolors.OKBLUE, + ) + with model.unpack_model(path_to_archive) as unpacked: agent = Agent.load( unpacked, @@ -692,6 +697,10 @@ def _nlu_model_for_finetuning( if not path_to_archive: return None + rasa.shared.utils.cli.print_color( + f"Loading model from {path_to_archive} for finetuning...", + color=rasa.shared.utils.io.bcolors.OKBLUE, + ) with model.unpack_model(path_to_archive) as unpacked: _, old_nlu = model.get_model_subdirectories(unpacked) From cf4aaf664e7e6f88528455f1aa216f10547f1444 Mon Sep 17 00:00:00 2001 From: Daksh Date: Wed, 9 Dec 2020 11:45:57 +0100 Subject: [PATCH 36/45] add vocabulary stats logging for cvf --- .../count_vectors_featurizer.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index bb3cf64c1acf..dd5323945b5e 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -520,6 +520,7 @@ def _train_with_shared_vocab(self, attribute_texts: Dict[Text, List[Text]]) -> N self._fit_vectorizer_from_scratch(TEXT, combined_cleaned_texts) else: self._fit_loaded_vectorizer(TEXT, combined_cleaned_texts) + self._log_vocabulary_stats(TEXT) def _train_with_independent_vocab( self, attribute_texts: Dict[Text, List[Text]] @@ -547,12 +548,29 @@ def _train_with_independent_vocab( ) else: self._fit_loaded_vectorizer(attribute, attribute_texts[attribute]) + + self._log_vocabulary_stats(attribute) else: logger.debug( f"No text provided for {attribute} attribute in any messages of " f"training data. Skipping training a CountVectorizer for it." ) + def _log_vocabulary_stats(self, attribute: Text) -> None: + """Logs number of vocabulary slots filled out of the total number of available slots. + + Args: + attribute: Message attribute for which vocabulary stats are logged. 
+ """ + if attribute in DENSE_FEATURIZABLE_ATTRIBUTES: + attribute_vocabulary = self.vectorizers[attribute].vocabulary_ + first_empty_index = self._get_starting_empty_index(attribute_vocabulary) + logger.info( + f"{first_empty_index - 1} vocabulary slots " + f"consumed out of {len(attribute_vocabulary)} " + f"slots configured for {attribute} attribute" + ) + def _fit_loaded_vectorizer( self, attribute: Text, attribute_texts: List[Text] ) -> None: From 757174d9368ef938ce612ed3b75b7f49b88d4ae1 Mon Sep 17 00:00:00 2001 From: Daksh Date: Wed, 9 Dec 2020 12:43:08 +0100 Subject: [PATCH 37/45] code quality --- rasa/core/policies/sklearn_policy.py | 2 -- rasa/core/policies/ted_policy.py | 10 ++++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/rasa/core/policies/sklearn_policy.py b/rasa/core/policies/sklearn_policy.py index 04a7459f4e82..3c1340d3e4e2 100644 --- a/rasa/core/policies/sklearn_policy.py +++ b/rasa/core/policies/sklearn_policy.py @@ -325,8 +325,6 @@ def load( meta = json.loads(rasa.shared.utils.io.read_file(meta_file)) zero_state_features = io_utils.pickle_load(zero_features_filename) - data = {"should_finetune": kwargs.get("should_finetune", False)} - policy = cls( featurizer=featurizer, priority=meta["priority"], diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py index d47a676d9d54..e786f8c23c3c 100644 --- a/rasa/core/policies/ted_policy.py +++ b/rasa/core/policies/ted_policy.py @@ -440,7 +440,9 @@ def persist(self, path: Union[Text, Path]) -> None: ) @classmethod - def load(cls, path: Union[Text, Path], **kwargs: Any) -> "TEDPolicy": + def load( + cls, path: Union[Text, Path], should_finetune: bool = False, **kwargs: Any + ) -> "TEDPolicy": """Loads a policy from the storage. **Needs to load its featurizer** @@ -480,7 +482,7 @@ def load(cls, path: Union[Text, Path], **kwargs: Any) -> "TEDPolicy": ) meta = train_utils.update_similarity_type(meta) - meta["should_finetune"] = kwargs.get("should_finetune", False) + meta["should_finetune"] = should_finetune if "epoch_override" in kwargs: meta[EPOCHS] = kwargs["epoch_override"] @@ -493,10 +495,10 @@ def load(cls, path: Union[Text, Path], **kwargs: Any) -> "TEDPolicy": featurizer, MaxHistoryTrackerFeaturizer ), label_data=label_data, - finetune_mode=meta["should_finetune"], + finetune_mode=should_finetune, ) - if not meta["should_finetune"]: + if not should_finetune: # build the graph for prediction predict_data_example = RasaModelData( label_key=LABEL_KEY, From 68b76dfde3f347f9ed2938beef4ba060f415d02e Mon Sep 17 00:00:00 2001 From: Daksh Date: Wed, 9 Dec 2020 14:26:51 +0100 Subject: [PATCH 38/45] review comments --- .../featurizers/sparse_featurizer/count_vectors_featurizer.py | 4 ++-- rasa/utils/tensorflow/models.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index dd5323945b5e..41502fc5b2f4 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -395,7 +395,7 @@ def _update_vectorizer_vocabulary( def _merge_new_vocabulary_tokens( self, existing_vocabulary: Dict[Text, int], vocabulary: Set[Text] - ): + ) -> None: available_empty_index = self._get_starting_empty_index(existing_vocabulary) for token in vocabulary: if token not in existing_vocabulary: @@ -405,7 +405,7 @@ def _merge_new_vocabulary_tokens( if available_empty_index == 
len(existing_vocabulary): # We have exhausted all available vocabulary slots. # Drop the remaining vocabulary. - break + return def _get_additional_vocabulary_size( self, attribute: Text, existing_vocabulary_size: int diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py index c1ac9c0fc696..02b374ac4b3c 100644 --- a/rasa/utils/tensorflow/models.py +++ b/rasa/utils/tensorflow/models.py @@ -330,7 +330,7 @@ def load( cls, model_file_name: Text, model_data_example: RasaModelData, - finetune_mode=False, + finetune_mode: bool = False, *args, **kwargs, ) -> "RasaModel": From 6a2d12e5dae625d2a6c66ed70ebf34dc2376bf6c Mon Sep 17 00:00:00 2001 From: Daksh Date: Wed, 9 Dec 2020 16:02:25 +0100 Subject: [PATCH 39/45] reduce number of finetuning epochs in tests --- tests/test_train.py | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/tests/test_train.py b/tests/test_train.py index cb6ac099ca2f..87fa30220cf0 100644 --- a/tests/test_train.py +++ b/tests/test_train.py @@ -383,7 +383,7 @@ def test_model_finetuning( output=output, force_training=True, model_to_finetune=trained_rasa_model, - finetuning_epoch_fraction=1, + finetuning_epoch_fraction=0.1, ) mocked_core_training.assert_called_once() @@ -399,8 +399,6 @@ def test_model_finetuning( def test_model_finetuning_core( tmp_path: Path, monkeypatch: MonkeyPatch, - default_domain_path: Text, - default_nlu_data: Text, trained_moodbot_path: Text, use_latest_model: bool, ): @@ -420,7 +418,7 @@ def test_model_finetuning_core( # from scratch. # Fine-tuning will use the number of epochs in the new config. old_config = rasa.shared.utils.io.read_yaml_file("examples/moodbot/config.yml") - old_config["policies"][0]["epochs"] = 20 + old_config["policies"][0]["epochs"] = 10 new_config_path = tmp_path / "new_config.yml" rasa.shared.utils.io.write_yaml(old_config, new_config_path) @@ -430,7 +428,7 @@ def test_model_finetuning_core( "examples/moodbot/data/stories.yml", output=output, model_to_finetune=trained_moodbot_path, - finetuning_epoch_fraction=0.5, + finetuning_epoch_fraction=0.2, ) mocked_core_training.assert_called_once() @@ -439,16 +437,12 @@ def test_model_finetuning_core( assert isinstance(model_to_finetune, Agent) ted = model_to_finetune.policy_ensemble.policies[0] - assert ted.config[EPOCHS] == 10 + assert ted.config[EPOCHS] == 2 assert ted.config["should_finetune"] is True def test_model_finetuning_core_with_default_epochs( - tmp_path: Path, - monkeypatch: MonkeyPatch, - default_domain_path: Text, - default_nlu_data: Text, - trained_moodbot_path: Text, + tmp_path: Path, monkeypatch: MonkeyPatch, trained_moodbot_path: Text, ): mocked_core_training = AsyncMock() monkeypatch.setattr(rasa.core, rasa.core.train.__name__, mocked_core_training) @@ -484,8 +478,6 @@ def test_model_finetuning_core_with_default_epochs( def test_model_finetuning_nlu( tmp_path: Path, monkeypatch: MonkeyPatch, - default_domain_path: Text, - default_nlu_data: Text, trained_moodbot_path: Text, use_latest_model: bool, ): @@ -517,7 +509,7 @@ def test_model_finetuning_nlu( "examples/moodbot/data/nlu.yml", output=output, model_to_finetune=trained_moodbot_path, - finetuning_epoch_fraction=0.5, + finetuning_epoch_fraction=0.2, ) assert mock_interpreter_create.call_args[1]["should_finetune"] @@ -532,15 +524,11 @@ def test_model_finetuning_nlu( new_diet_metadata = model_to_finetune.model_metadata.metadata["pipeline"][-1] assert new_diet_metadata["name"] == "DIETClassifier" - assert new_diet_metadata[EPOCHS] == 5 + assert 
new_diet_metadata[EPOCHS] == 2 def test_model_finetuning_nlu_with_default_epochs( - tmp_path: Path, - monkeypatch: MonkeyPatch, - default_domain_path: Text, - default_nlu_data: Text, - trained_moodbot_path: Text, + tmp_path: Path, monkeypatch: MonkeyPatch, trained_moodbot_path: Text, ): mocked_nlu_training = AsyncMock(return_value="") monkeypatch.setattr(rasa.nlu, rasa.nlu.train.__name__, mocked_nlu_training) @@ -560,7 +548,7 @@ def test_model_finetuning_nlu_with_default_epochs( "examples/moodbot/data/nlu.yml", output=output, model_to_finetune=trained_moodbot_path, - finetuning_epoch_fraction=0.5, + finetuning_epoch_fraction=0.1, ) mocked_nlu_training.assert_called_once() @@ -568,7 +556,7 @@ def test_model_finetuning_nlu_with_default_epochs( model_to_finetune = nlu_train_kwargs["model_to_finetune"] new_diet_metadata = model_to_finetune.model_metadata.metadata["pipeline"][-1] assert new_diet_metadata["name"] == "DIETClassifier" - assert new_diet_metadata[EPOCHS] == DIETClassifier.defaults[EPOCHS] * 0.5 + assert new_diet_metadata[EPOCHS] == DIETClassifier.defaults[EPOCHS] * 0.1 @pytest.mark.parametrize("model_to_fine_tune", ["invalid-path-to-model", "."]) From 90b5394d49d777bb6cae3f3c18269cdd11bbadf5 Mon Sep 17 00:00:00 2001 From: Joseph Juzl Date: Wed, 9 Dec 2020 17:47:35 +0100 Subject: [PATCH 40/45] Use fingerprinting for finetuning and add more tests --- rasa/model.py | 36 ++-- .../shared/nlu/training_data/training_data.py | 8 + rasa/train.py | 56 +++-- tests/test_model.py | 45 ++-- tests/test_train.py | 192 ++++++++++++------ 5 files changed, 224 insertions(+), 113 deletions(-) diff --git a/rasa/model.py b/rasa/model.py index 77f86c4eb616..9a5a8eb4c517 100644 --- a/rasa/model.py +++ b/rasa/model.py @@ -50,6 +50,7 @@ FINGERPRINT_RASA_VERSION_KEY = "version" FINGERPRINT_STORIES_KEY = "stories" FINGERPRINT_NLU_DATA_KEY = "messages" +FINGERPRINT_NLU_LABELS_KEY = "nlu_labels" FINGERPRINT_PROJECT = "project" FINGERPRINT_TRAINED_AT_KEY = "trained_at" @@ -82,14 +83,6 @@ class Section(NamedTuple): ) SECTION_NLG = Section(name="NLG templates", relevant_keys=[FINGERPRINT_NLG_KEY]) -SECTION_FINE_TUNE = Section( - name="Fine-tune", - relevant_keys=[ - FINGERPRINT_CONFIG_WITHOUT_EPOCHS_KEY, - FINGERPRINT_DOMAIN_WITHOUT_NLG_KEY, - ], -) - class FingerprintComparisonResult: """Container for the results of a fingerprint comparison.""" @@ -346,6 +339,7 @@ async def model_fingerprint(file_importer: "TrainingDataImporter") -> Fingerprin FINGERPRINT_NLG_KEY: rasa.shared.utils.io.deep_container_fingerprint(responses), FINGERPRINT_PROJECT: project_fingerprint(), FINGERPRINT_NLU_DATA_KEY: nlu_data.fingerprint(), + FINGERPRINT_NLU_LABELS_KEY: nlu_data.label_fingerprint(), FINGERPRINT_STORIES_KEY: stories.fingerprint(), FINGERPRINT_TRAINED_AT_KEY: time.time(), FINGERPRINT_RASA_VERSION_KEY: rasa.__version__, @@ -376,7 +370,7 @@ def _get_fingerprint_of_config_without_epochs( copied_config = copy.deepcopy(config) for key in ["pipeline", "policies"]: - if key in copied_config: + if key in copied_config and copied_config[key]: for p in copied_config[key]: if "epochs" in p: del p["epochs"] @@ -500,18 +494,35 @@ def should_retrain( return fingerprint_comparison -def can_fine_tune(last_fingerprint: Fingerprint, new_fingerprint: Fingerprint) -> bool: +def can_finetune( + last_fingerprint: Fingerprint, + new_fingerprint: Fingerprint, + core: bool = False, + nlu: bool = False, +) -> bool: """Checks if components of a model can be finetuned with incremental training. 
Args: last_fingerprint: The fingerprint of the old model to potentially be fine-tuned. new_fingerprint: The fingerprint of the new model. + core: Check sections for finetuning a core model. + nlu: Check sections for finetuning an nlu model. Returns: - `True` if the old model can be fine-tuned, `False` otherwise. + `True` if the old model can be finetuned, `False` otherwise. """ + section_keys = [ + FINGERPRINT_CONFIG_WITHOUT_EPOCHS_KEY, + ] + if core: + section_keys.append(FINGERPRINT_DOMAIN_WITHOUT_NLG_KEY) + if nlu: + section_keys.append(FINGERPRINT_NLU_LABELS_KEY) + fingerprint_changed = did_section_fingerprint_change( - last_fingerprint, new_fingerprint, SECTION_FINE_TUNE + last_fingerprint, + new_fingerprint, + Section(name="finetune", relevant_keys=section_keys), ) old_model_above_min_version = version.parse( @@ -562,7 +573,6 @@ async def update_model_with_new_domain( importer: Importer which provides the new domain. unpacked_model_path: Path to the unpacked model. """ - model_path = Path(unpacked_model_path) / DEFAULT_CORE_SUBDIRECTORY_NAME domain = await importer.get_domain() diff --git a/rasa/shared/nlu/training_data/training_data.py b/rasa/shared/nlu/training_data/training_data.py index 268e05340fc5..4206fcd21847 100644 --- a/rasa/shared/nlu/training_data/training_data.py +++ b/rasa/shared/nlu/training_data/training_data.py @@ -79,6 +79,14 @@ def fingerprint(self) -> Text: return rasa.shared.utils.io.deep_container_fingerprint(relevant_attributes) + def label_fingerprint(self) -> Text: + """Fingerprint the labels in the training data. + + Returns: + hex string as a fingerprint of the training data labels. + """ + return rasa.shared.utils.io.deep_container_fingerprint(self.intents) + def merge(self, *others: Optional["TrainingData"]) -> "TrainingData": """Return merged instance of this data with other training data. diff --git a/rasa/train.py b/rasa/train.py index 6bd70e8210cd..ab891358671d 100644 --- a/rasa/train.py +++ b/rasa/train.py @@ -2,13 +2,13 @@ import os import tempfile from contextlib import ExitStack -from typing import Any, Text, Optional, List, Union, Dict, TYPE_CHECKING +from typing import Any, Text, Optional, List, Union, Dict import rasa.core.interpreter from rasa.shared.nlu.interpreter import NaturalLanguageInterpreter from rasa.shared.importers.importer import TrainingDataImporter from rasa import model, telemetry -from rasa.model import FingerprintComparisonResult +from rasa.model import Fingerprint, FingerprintComparisonResult from rasa.shared.core.domain import Domain from rasa.nlu.model import Interpreter import rasa.utils.common @@ -446,14 +446,14 @@ async def _train_core_with_validated_data( ) if model_to_finetune: - model_to_finetune = _core_model_for_finetuning( + model_to_finetune = await _core_model_for_finetuning( model_to_finetune, - new_config=config, + file_importer=file_importer, finetuning_epoch_fraction=finetuning_epoch_fraction, ) if not model_to_finetune: - rasa.shared.utils.cli.print_warning( + rasa.shared.utils.cli.print_error_and_exit( f"No Core model for finetuning found. Please make sure to either " f"specify a path to a previous model or to have a finetunable " f"model within the directory '{output}'." 
@@ -491,9 +491,9 @@ async def _train_core_with_validated_data( return _train_path -def _core_model_for_finetuning( +async def _core_model_for_finetuning( model_to_finetune: Text, - new_config: Optional[Dict] = None, + file_importer: TrainingDataImporter, finetuning_epoch_fraction: float = 1.0, ) -> Optional[Agent]: path_to_archive = model.get_model_for_finetuning(model_to_finetune) @@ -501,9 +501,17 @@ def _core_model_for_finetuning( return None with model.unpack_model(path_to_archive) as unpacked: + new_fingerprint = await model.model_fingerprint(file_importer) + old_fingerprint = model.fingerprint_from_path(unpacked) + if not model.can_finetune(old_fingerprint, new_fingerprint, core=True): + rasa.shared.utils.cli.print_error_and_exit( + "NLU model can not be finetuned." + ) + + config = await file_importer.get_config() agent = Agent.load( unpacked, - new_config=new_config, + new_config=config, finetuning_epoch_fraction=finetuning_epoch_fraction, ) # Agent might be empty if no underlying Core model was found. @@ -639,12 +647,14 @@ async def _train_nlu_with_validated_data( ) if model_to_finetune: - model_to_finetune = _nlu_model_for_finetuning( - model_to_finetune, config, finetuning_epoch_fraction + model_to_finetune = await _nlu_model_for_finetuning( + model_to_finetune, + file_importer, + finetuning_epoch_fraction, + also_train_core=train_path is not None, ) - if not model_to_finetune: - rasa.shared.utils.cli.print_warning( + rasa.shared.utils.cli.print_error_and_exit( f"No NLU model for finetuning found. Please make sure to either " f"specify a path to a previous model or to have a finetunable " f"model within the directory '{output}'." @@ -683,20 +693,34 @@ async def _train_nlu_with_validated_data( return _train_path -def _nlu_model_for_finetuning( +async def _nlu_model_for_finetuning( model_to_finetune: Text, - new_config: Dict[Text, Any], + file_importer: TrainingDataImporter, finetuning_epoch_fraction: float = 1.0, + also_train_core: bool = False, ) -> Optional[Interpreter]: + path_to_archive = model.get_model_for_finetuning(model_to_finetune) if not path_to_archive: return None with model.unpack_model(path_to_archive) as unpacked: _, old_nlu = model.get_model_subdirectories(unpacked) + new_fingerprint = await model.model_fingerprint(file_importer) + old_fingerprint = model.fingerprint_from_path(unpacked) + if not model.can_finetune( + old_fingerprint, new_fingerprint, nlu=True, core=also_train_core + ): + rasa.shared.utils.cli.print_error_and_exit( + "NLU model can not be finetuned." 
+ ) - return Interpreter.load( + config = await file_importer.get_config() + model_to_finetune = Interpreter.load( old_nlu, - new_config=new_config, + new_config=config, finetuning_epoch_fraction=finetuning_epoch_fraction, ) + if not model_to_finetune: + return None + return model_to_finetune diff --git a/tests/test_model.py b/tests/test_model.py index 2c6350dd1a63..2114cfaa74d3 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -37,9 +37,8 @@ FINGERPRINT_CONFIG_CORE_KEY, FINGERPRINT_CONFIG_NLU_KEY, SECTION_CORE, - SECTION_FINE_TUNE, SECTION_NLU, - can_fine_tune, + can_finetune, create_package_rasa, get_latest_model, get_model, @@ -51,7 +50,6 @@ FingerprintComparisonResult, ) from rasa.exceptions import ModelNotFound -from tests.conftest import AsyncMock from tests.core.conftest import DEFAULT_DOMAIN_PATH_WITH_MAPPING @@ -192,28 +190,6 @@ def test_nlu_fingerprint_changed(fingerprint2: Fingerprint, changed: bool): ) -@pytest.mark.parametrize( - "fingerprint2, changed", - [ - (_fingerprint(config_without_epochs=["other"]), True), - (_fingerprint(domain=["other"]), True), - (_fingerprint(config=["other"]), False), - (_fingerprint(rasa_version="100"), False), - (_fingerprint(config_core=["other"]), False), - (_fingerprint(config_nlu=["other"]), False), - (_fingerprint(nlu=["other"]), False), - (_fingerprint(nlg=["other"]), False), - (_fingerprint(stories=["other"]), False), - ], -) -def test_fine_tune_fingerprint_changed(fingerprint2: Fingerprint, changed: bool): - fingerprint1 = _fingerprint() - assert ( - did_section_fingerprint_change(fingerprint1, fingerprint2, SECTION_FINE_TUNE) - is changed - ) - - def _project_files( project: Text, config_file: Text = DEFAULT_CONFIG_PATH, @@ -563,7 +539,7 @@ async def get_domain() -> Domain: "min_compatible_version, old_model_version, can_tune", [("2.1.0", "2.1.0", True), ("2.0.0", "2.1.0", True), ("2.1.0", "2.0.0", False),], ) -async def test_can_fine_tune_min_version( +async def test_can_finetune_min_version( project: Text, monkeypatch: MonkeyPatch, old_model_version: Text, @@ -579,4 +555,19 @@ async def test_can_fine_tune_min_version( old_fingerprint = await model_fingerprint(importer) new_fingerprint = await model_fingerprint(importer) - assert can_fine_tune(old_fingerprint, new_fingerprint) == can_tune + assert can_finetune(old_fingerprint, new_fingerprint) == can_tune + + +@pytest.mark.parametrize("empty_key", ["pipeline", "policies"]) +async def test_fingerprinting_config_epochs_empty_pipeline_or_policies( + project: Text, tmp_path: Path, empty_key: Text, +): + config = { + "language": "en", + "pipeline": [{"name": "WhitespaceTokenizer"},], + "policies": [{"name": "MemoizationPolicy"},], + } + + config[empty_key] = None + + model._get_fingerprint_of_config_without_epochs(config) diff --git a/tests/test_train.py b/tests/test_train.py index cb6ac099ca2f..a4ea7b2fee51 100644 --- a/tests/test_train.py +++ b/tests/test_train.py @@ -399,8 +399,6 @@ def test_model_finetuning( def test_model_finetuning_core( tmp_path: Path, monkeypatch: MonkeyPatch, - default_domain_path: Text, - default_nlu_data: Text, trained_moodbot_path: Text, use_latest_model: bool, ): @@ -424,10 +422,19 @@ def test_model_finetuning_core( new_config_path = tmp_path / "new_config.yml" rasa.shared.utils.io.write_yaml(old_config, new_config_path) + old_stories = rasa.shared.utils.io.read_yaml_file( + "examples/moodbot/data/stories.yml" + ) + old_stories["stories"].append( + {"story": "new story", "steps": [{"intent": "greet"}]} + ) + new_stories_path = tmp_path / 
"new_stories.yml" + rasa.shared.utils.io.write_yaml(old_stories, new_stories_path) + train_core( "examples/moodbot/domain.yml", str(new_config_path), - "examples/moodbot/data/stories.yml", + str(new_stories_path), output=output, model_to_finetune=trained_moodbot_path, finetuning_epoch_fraction=0.5, @@ -444,11 +451,7 @@ def test_model_finetuning_core( def test_model_finetuning_core_with_default_epochs( - tmp_path: Path, - monkeypatch: MonkeyPatch, - default_domain_path: Text, - default_nlu_data: Text, - trained_moodbot_path: Text, + tmp_path: Path, monkeypatch: MonkeyPatch, trained_moodbot_path: Text, ): mocked_core_training = AsyncMock() monkeypatch.setattr(rasa.core, rasa.core.train.__name__, mocked_core_training) @@ -480,12 +483,68 @@ def test_model_finetuning_core_with_default_epochs( assert ted.config[EPOCHS] == TEDPolicy.defaults[EPOCHS] * 2 +def test_model_finetuning_core_new_domain_label( + tmp_path: Path, monkeypatch: MonkeyPatch, trained_moodbot_path: Text, +): + mocked_core_training = AsyncMock() + monkeypatch.setattr(rasa.core, rasa.core.train.__name__, mocked_core_training) + + (tmp_path / "models").mkdir() + output = str(tmp_path / "models") + + old_domain = rasa.shared.utils.io.read_yaml_file("examples/moodbot/domain.yml") + old_domain["intents"].append("a_new_one") + new_domain_path = tmp_path / "new_domain.yml" + rasa.shared.utils.io.write_yaml(old_domain, new_domain_path) + + with pytest.raises(SystemExit): + train_core( + domain=str(new_domain_path), + config="examples/moodbot/config.yml", + stories="examples/moodbot/data/stories.yml", + output=output, + model_to_finetune=trained_moodbot_path, + ) + + mocked_core_training.assert_not_called() + + +def test_model_finetuning_new_domain_label_stops_all_training( + tmp_path: Path, monkeypatch: MonkeyPatch, trained_moodbot_path: Text, +): + mocked_core_training = AsyncMock() + mocked_nlu_training = AsyncMock() + monkeypatch.setattr(rasa.core, rasa.core.train.__name__, mocked_core_training) + monkeypatch.setattr(rasa.nlu, rasa.nlu.train.__name__, mocked_nlu_training) + + (tmp_path / "models").mkdir() + output = str(tmp_path / "models") + + old_domain = rasa.shared.utils.io.read_yaml_file("examples/moodbot/domain.yml") + old_domain["intents"].append("a_new_one") + new_domain_path = tmp_path / "new_domain.yml" + rasa.shared.utils.io.write_yaml(old_domain, new_domain_path) + + with pytest.raises(SystemExit): + train( + domain=str(new_domain_path), + config="examples/moodbot/config.yml", + training_files=[ + "examples/moodbot/data/stories.yml", + "examples/moodbot/data/nlu.yml", + ], + output=output, + model_to_finetune=trained_moodbot_path, + ) + + mocked_core_training.assert_not_called() + mocked_nlu_training.assert_not_called() + + @pytest.mark.parametrize("use_latest_model", [True, False]) def test_model_finetuning_nlu( tmp_path: Path, monkeypatch: MonkeyPatch, - default_domain_path: Text, - default_nlu_data: Text, trained_moodbot_path: Text, use_latest_model: bool, ): @@ -512,9 +571,15 @@ def test_model_finetuning_nlu( new_config_path = tmp_path / "new_config.yml" rasa.shared.utils.io.write_yaml(old_config, new_config_path) + old_nlu = rasa.shared.utils.io.read_yaml_file("examples/moodbot/data/nlu.yml") + old_nlu["nlu"][-1]["examples"] = "-something else" + new_nlu_path = tmp_path / "new_nlu.yml" + rasa.shared.utils.io.write_yaml(old_nlu, new_nlu_path) + train_nlu( str(new_config_path), - "examples/moodbot/data/nlu.yml", + str(new_nlu_path), + domain="examples/moodbot/domain.yml", output=output, 
model_to_finetune=trained_moodbot_path, finetuning_epoch_fraction=0.5, @@ -535,12 +600,34 @@ def test_model_finetuning_nlu( assert new_diet_metadata[EPOCHS] == 5 +def test_model_finetuning_nlu_new_label( + tmp_path: Path, monkeypatch: MonkeyPatch, trained_moodbot_path: Text, +): + mocked_nlu_training = AsyncMock(return_value="") + monkeypatch.setattr(rasa.nlu, rasa.nlu.train.__name__, mocked_nlu_training) + + (tmp_path / "models").mkdir() + output = str(tmp_path / "models") + + old_nlu = rasa.shared.utils.io.read_yaml_file("examples/moodbot/data/nlu.yml") + old_nlu["nlu"].append({"intent": "a_new_one", "examples": "-blah"}) + new_nlu_path = tmp_path / "new_nlu.yml" + rasa.shared.utils.io.write_yaml(old_nlu, new_nlu_path) + + with pytest.raises(SystemExit): + train_nlu( + "examples/moodbot/config.yml", + str(new_nlu_path), + domain="examples/moodbot/domain.yml", + output=output, + model_to_finetune=trained_moodbot_path, + ) + + mocked_nlu_training.assert_not_called() + + def test_model_finetuning_nlu_with_default_epochs( - tmp_path: Path, - monkeypatch: MonkeyPatch, - default_domain_path: Text, - default_nlu_data: Text, - trained_moodbot_path: Text, + tmp_path: Path, monkeypatch: MonkeyPatch, trained_moodbot_path: Text, ): mocked_nlu_training = AsyncMock(return_value="") monkeypatch.setattr(rasa.nlu, rasa.nlu.train.__name__, mocked_nlu_training) @@ -591,26 +678,21 @@ def test_model_finetuning_with_invalid_model( (tmp_path / "models").mkdir() output = str(tmp_path / "models") - train( - default_domain_path, - default_stack_config, - [default_stories_file, default_nlu_data], - output=output, - force_training=True, - model_to_finetune=model_to_fine_tune, - finetuning_epoch_fraction=1, - ) - - mocked_core_training.assert_called_once() - _, kwargs = mocked_core_training.call_args - assert kwargs["model_to_finetune"] is None + with pytest.raises(SystemExit): + train( + default_domain_path, + default_stack_config, + [default_stories_file, default_nlu_data], + output=output, + force_training=True, + model_to_finetune=model_to_fine_tune, + finetuning_epoch_fraction=1, + ) - mocked_nlu_training.assert_called_once() - _, kwargs = mocked_nlu_training.call_args - assert kwargs["model_to_finetune"] is None + mocked_core_training.assert_not_called() + mocked_nlu_training.assert_not_called() output = capsys.readouterr().out - assert "No Core model for finetuning found" in output assert "No NLU model for finetuning found" in output @@ -630,19 +712,17 @@ def test_model_finetuning_with_invalid_model_core( (tmp_path / "models").mkdir() output = str(tmp_path / "models") - train_core( - default_domain_path, - default_stack_config, - default_stories_file, - output=output, - model_to_finetune=model_to_fine_tune, - finetuning_epoch_fraction=1, - ) - - mocked_core_training.assert_called_once() + with pytest.raises(SystemExit): + train_core( + default_domain_path, + default_stack_config, + default_stories_file, + output=output, + model_to_finetune=model_to_fine_tune, + finetuning_epoch_fraction=1, + ) - _, kwargs = mocked_core_training.call_args - assert kwargs["model_to_finetune"] is None + mocked_core_training.assert_not_called() assert "No Core model for finetuning found" in capsys.readouterr().out @@ -663,18 +743,16 @@ def test_model_finetuning_with_invalid_model_nlu( (tmp_path / "models").mkdir() output = str(tmp_path / "models") - train_nlu( - default_stack_config, - default_nlu_data, - domain=default_domain_path, - output=output, - model_to_finetune=model_to_fine_tune, - finetuning_epoch_fraction=1, - ) - - 
mocked_nlu_training.assert_called_once() + with pytest.raises(SystemExit): + train_nlu( + default_stack_config, + default_nlu_data, + domain=default_domain_path, + output=output, + model_to_finetune=model_to_fine_tune, + finetuning_epoch_fraction=1, + ) - _, kwargs = mocked_nlu_training.call_args - assert kwargs["model_to_finetune"] is None + mocked_nlu_training.assert_not_called() assert "No NLU model for finetuning found" in capsys.readouterr().out From 5721fe64e8ea553f186ddd0ed975cd39c5f47c5b Mon Sep 17 00:00:00 2001 From: Daksh Date: Wed, 9 Dec 2020 18:20:39 +0100 Subject: [PATCH 41/45] review comments --- .../count_vectors_featurizer.py | 24 +++++++++---------- .../sparse_featurizer/regex_featurizer.py | 24 +++++++++---------- rasa/train.py | 10 ++++---- rasa/utils/tensorflow/models.py | 4 +++- tests/core/policies/test_rule_policy.py | 8 +++---- tests/nlu/classifiers/test_diet_classifier.py | 10 ++++---- .../test_count_vectors_featurizer.py | 1 - .../nlu/featurizers/test_regex_featurizer.py | 10 +++----- 8 files changed, 42 insertions(+), 49 deletions(-) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index 41502fc5b2f4..ca6ade583f9b 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -89,7 +89,7 @@ def required_components(cls) -> List[Type[Component]]: # indicates whether the featurizer should use the lemma of a word for # counting (if available) or not "use_lemma": True, - # Additional vocabulary size to be kept reserved + # Additional vocabulary size to be kept reserved for finetuning "additional_vocabulary_size": {TEXT: None, RESPONSE: None, ACTION_TEXT: None}, } @@ -131,7 +131,6 @@ def _load_count_vect_params(self) -> None: # use the lemma of the words or not self.use_lemma = self.component_config["use_lemma"] - # noinspection PyPep8Naming def _load_vocabulary_params(self) -> None: self.OOV_token = self.component_config["OOV_token"] @@ -157,11 +156,10 @@ def _load_vocabulary_params(self) -> None: if isinstance(self.additional_vocabulary_size, int): # User defined a common value for all attributes user_defined_additional_size = self.additional_vocabulary_size - self.additional_vocabulary_size = {} - for attribute in DENSE_FEATURIZABLE_ATTRIBUTES: - self.additional_vocabulary_size[ - attribute - ] = user_defined_additional_size + self.additional_vocabulary_size = { + attribute: user_defined_additional_size + for attribute in DENSE_FEATURIZABLE_ATTRIBUTES + } def _check_attribute_vocabulary(self, attribute: Text) -> bool: """Checks if trained vocabulary exists in attribute's count vectorizer.""" @@ -373,7 +371,7 @@ def _get_starting_empty_index(vocabulary: Dict[Text, int]) -> int: def _update_vectorizer_vocabulary( self, attribute: Text, new_vocabulary: Set[Text] ) -> None: - """Update the existing vocabulary of the vectorizer with new unseen words. + """Updates the existing vocabulary of the vectorizer with new unseen words. These unseen words should only occupy the empty buffer slots. 
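As a reference point for the buffer-slot bookkeeping touched above, the reserved vocabulary is configured on the featurizer when it is first trained. A minimal sketch follows; the size 1000 is a hypothetical value and mirrors the style of the featurizer tests later in this series.

from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
    CountVectorsFeaturizer,
)

# Reserve extra, initially empty vocabulary slots for the "text" attribute only;
# tokens first seen during incremental training are written into these slots.
featurizer = CountVectorsFeaturizer(
    {"additional_vocabulary_size": {"text": 1000}}, finetune_mode=False
)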
@@ -383,7 +381,7 @@ def _update_vectorizer_vocabulary( """ existing_vocabulary: Dict[Text, int] = self.vectorizers[attribute].vocabulary if len(new_vocabulary) > len(existing_vocabulary): - logger.warning( + rasa.shared.utils.io.raise_warning( f"New data contains vocabulary of size {len(new_vocabulary)} for attribute {attribute} " f"which is larger than the maximum vocabulary size({len(existing_vocabulary)}) " f"of the original model. Some tokens will have to be dropped " @@ -431,8 +429,10 @@ def _get_additional_vocabulary_size( # of new/existing labels(intents, actions, etc.) if attribute not in DENSE_FEATURIZABLE_ATTRIBUTES: return 0 - if self.additional_vocabulary_size.get(attribute) is not None: - return self.additional_vocabulary_size[attribute] + + configured_additional_size = self.additional_vocabulary_size.get(attribute) + if configured_additional_size is not None: + return configured_additional_size # If the user hasn't defined additional vocabulary size, # then we increase it by 1000 minimum. If the current @@ -568,7 +568,7 @@ def _log_vocabulary_stats(self, attribute: Text) -> None: logger.info( f"{first_empty_index - 1} vocabulary slots " f"consumed out of {len(attribute_vocabulary)} " - f"slots configured for {attribute} attribute" + f"slots configured for {attribute} attribute." ) def _fit_loaded_vectorizer( diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index 4d681f5a13fe..8563caf69283 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -109,7 +109,7 @@ def vocabulary_stats(self) -> Dict[Text, int]: ) return self.pattern_vocabulary_stats - def _merge_new_patterns(self, new_patterns: List[Dict[Text, Text]]): + def _merge_new_patterns(self, new_patterns: List[Dict[Text, Text]]) -> None: """Updates already known patterns with new patterns extracted from data. Args: @@ -136,7 +136,7 @@ def _merge_new_patterns(self, new_patterns: List[Dict[Text, Text]]): continue self.known_patterns.append(extra_pattern) if patterns_dropped: - logger.warning( + rasa.shared.utils.io.raise_warning( f"The originally trained model was configured to " f"handle a maximum number of {max_number_patterns} patterns. " f"The current training data exceeds this number as " @@ -306,21 +306,19 @@ def load( """ file_name = meta.get("file") - patterns_file_name = Path(model_dir).joinpath(file_name + ".patterns.pkl") + patterns_file_name = Path(model_dir) / file_name + ".patterns.pkl" - vocabulary_stats_file_name = Path(model_dir).joinpath( - file_name + ".vocabulary_stats.pkl" + vocabulary_stats_file_name = ( + Path(model_dir) / file_name + ".vocabulary_stats.pkl" ) known_patterns = None vocabulary_stats = None if patterns_file_name.exists(): - known_patterns = rasa.shared.utils.io.read_json_file( - str(patterns_file_name) - ) + known_patterns = rasa.shared.utils.io.read_json_file(patterns_file_name) if vocabulary_stats_file_name.exists(): vocabulary_stats = rasa.shared.utils.io.read_json_file( - str(vocabulary_stats_file_name) + vocabulary_stats_file_name ) return RegexFeaturizer( @@ -341,10 +339,10 @@ def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]] Metadata necessary to load the model again. 
""" patterns_file_name = file_name + ".patterns.pkl" - regex_file = Path(model_dir).joinpath(patterns_file_name) - utils.write_json_to_file(str(regex_file), self.known_patterns, indent=4) + regex_file = Path(model_dir) / patterns_file_name + utils.write_json_to_file(regex_file, self.known_patterns, indent=4) vocabulary_stats_file_name = file_name + ".vocabulary_stats.pkl" - vocabulary_file = Path(model_dir).joinpath(vocabulary_stats_file_name) - utils.write_json_to_file(str(vocabulary_file), self.vocabulary_stats, indent=4) + vocabulary_file = Path(model_dir) / vocabulary_stats_file_name + utils.write_json_to_file(vocabulary_file, self.vocabulary_stats, indent=4) return {"file": file_name} diff --git a/rasa/train.py b/rasa/train.py index 1dab51e8cd51..221398021ab1 100644 --- a/rasa/train.py +++ b/rasa/train.py @@ -500,9 +500,8 @@ def _core_model_for_finetuning( if not path_to_archive: return None - rasa.shared.utils.cli.print_color( - f"Loading model from {path_to_archive} for finetuning...", - color=rasa.shared.utils.io.bcolors.OKBLUE, + rasa.shared.utils.cli.print_info( + f"Loading Core model from {path_to_archive} for finetuning...", ) with model.unpack_model(path_to_archive) as unpacked: @@ -697,9 +696,8 @@ def _nlu_model_for_finetuning( if not path_to_archive: return None - rasa.shared.utils.cli.print_color( - f"Loading model from {path_to_archive} for finetuning...", - color=rasa.shared.utils.io.bcolors.OKBLUE, + rasa.shared.utils.cli.print_info( + f"Loading NLU model from {path_to_archive} for finetuning...", ) with model.unpack_model(path_to_archive) as unpacked: _, old_nlu = model.get_model_subdirectories(unpacked) diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py index 02b374ac4b3c..3fa10c590a1e 100644 --- a/rasa/utils/tensorflow/models.py +++ b/rasa/utils/tensorflow/models.py @@ -346,7 +346,9 @@ def load( Returns: Loaded model with weights appropriately set. """ - logger.debug(f"Loading the model with finetune_mode={finetune_mode}...") + logger.debug( + f"Loading the model from {model_file_name} with finetune_mode={finetune_mode}..." 
+ ) # create empty model model = cls(*args, **kwargs) # need to train on 1 example to build weights of the correct size diff --git a/tests/core/policies/test_rule_policy.py b/tests/core/policies/test_rule_policy.py index 836df77ee97f..98116619dfda 100644 --- a/tests/core/policies/test_rule_policy.py +++ b/tests/core/policies/test_rule_policy.py @@ -657,12 +657,12 @@ def test_all_policy_attributes_are_persisted(tmpdir: Path): async def test_rule_policy_finetune( - tmpdir: Path, trained_rule_policy: RulePolicy, trained_rule_policy_domain: Domain + tmp_path: Path, trained_rule_policy: RulePolicy, trained_rule_policy_domain: Domain ): - trained_rule_policy.persist(tmpdir) + trained_rule_policy.persist(tmp_path) - loaded_policy = RulePolicy.load(tmpdir, should_finetune=True) + loaded_policy = RulePolicy.load(tmp_path, should_finetune=True) assert loaded_policy.finetune_mode @@ -677,8 +677,8 @@ async def test_rule_policy_finetune( ActionExecuted("utter_stop"), ActionExecuted(ACTION_LISTEN_NAME), ], + is_rule_tracker=True, ) - new_rule.is_rule_tracker = True original_data = await training.load_data( "examples/rules/data/rules.yml", trained_rule_policy_domain diff --git a/tests/nlu/classifiers/test_diet_classifier.py b/tests/nlu/classifiers/test_diet_classifier.py index 9dc938f4ab58..6f23f613fe59 100644 --- a/tests/nlu/classifiers/test_diet_classifier.py +++ b/tests/nlu/classifiers/test_diet_classifier.py @@ -138,7 +138,7 @@ async def _train_persist_load_with_different_settings( @pytest.mark.skip_on_windows async def test_train_persist_load_with_different_settings_non_windows( - component_builder: ComponentBuilder, tmpdir: Path + component_builder: ComponentBuilder, tmp_path: Path ): pipeline = [ { @@ -150,10 +150,10 @@ async def test_train_persist_load_with_different_settings_non_windows( {"name": "DIETClassifier", MASKED_LM: True, EPOCHS: 1}, ] await _train_persist_load_with_different_settings( - pipeline, component_builder, tmpdir, False + pipeline, component_builder, tmp_path, should_finetune=False ) await _train_persist_load_with_different_settings( - pipeline, component_builder, tmpdir, True + pipeline, component_builder, tmp_path, should_finetune=True ) @@ -164,10 +164,10 @@ async def test_train_persist_load_with_different_settings(component_builder, tmp {"name": "DIETClassifier", LOSS_TYPE: "margin", EPOCHS: 1}, ] await _train_persist_load_with_different_settings( - pipeline, component_builder, tmpdir, False + pipeline, component_builder, tmpdir, should_finetune=False ) await _train_persist_load_with_different_settings( - pipeline, component_builder, tmpdir, True + pipeline, component_builder, tmpdir, should_finetune=True ) diff --git a/tests/nlu/featurizers/test_count_vectors_featurizer.py b/tests/nlu/featurizers/test_count_vectors_featurizer.py index 79c16aebcb6f..edf68d1ccdac 100644 --- a/tests/nlu/featurizers/test_count_vectors_featurizer.py +++ b/tests/nlu/featurizers/test_count_vectors_featurizer.py @@ -876,7 +876,6 @@ def test_cvf_incremental_train_vocabulary_overflow( data = TrainingData([train_message, additional_train_message]) tokenizer.train(data) - caplog.set_level(logging.WARNING) with caplog.at_level(logging.WARNING): new_featurizer.train(data) if overflow: diff --git a/tests/nlu/featurizers/test_regex_featurizer.py b/tests/nlu/featurizers/test_regex_featurizer.py index c08274f798ef..03c69229b167 100644 --- a/tests/nlu/featurizers/test_regex_featurizer.py +++ b/tests/nlu/featurizers/test_regex_featurizer.py @@ -375,8 +375,7 @@ def test_regex_featurizer_case_sensitive( assert 
np.allclose(sentence_features.toarray()[-1], sentence_vector, atol=1e-10) -def test_persist_load(tmp_path: Path): - +def test_persist_load_for_finetuning(tmp_path: Path): patterns = [ {"pattern": "[0-9]+", "name": "number", "usage": "intent"}, {"pattern": "\\bhey*", "name": "hello", "usage": "intent"}, @@ -401,8 +400,8 @@ def test_persist_load(tmp_path: Path): # Test all artifacts stored as part of persist assert persist_value["file"] == "ftr" - assert tmp_path.joinpath("ftr.patterns.pkl").exists() - assert tmp_path.joinpath("ftr.vocabulary_stats.pkl").exists() + assert (tmp_path / "ftr.patterns.pkl").exists() + assert (tmp_path / "ftr.vocabulary_stats.pkl").exists() assert featurizer.vocabulary_stats == { "max_number_patterns": 8, "pattern_slots_filled": 3, @@ -435,7 +434,6 @@ def test_persist_load(tmp_path: Path): def test_incremental_train_featurization(tmp_path: Path): - patterns = [ {"pattern": "[0-9]+", "name": "number", "usage": "intent"}, {"pattern": "\\bhey*", "name": "hello", "usage": "intent"}, @@ -520,7 +518,6 @@ def test_incremental_train_featurization(tmp_path: Path): def test_vocabulary_overflow_log(caplog: LogCaptureFixture): - patterns = [ {"pattern": "[0-9]+", "name": "number", "usage": "intent"}, {"pattern": "\\bhey*", "name": "hello", "usage": "intent"}, @@ -539,7 +536,6 @@ def test_vocabulary_overflow_log(caplog: LogCaptureFixture): {"pattern": "\\bhello+", "name": "greet", "usage": "intent"}, ] - caplog.set_level(logging.WARNING) with caplog.at_level(logging.WARNING): featurizer.train(TrainingData([], regex_features=additional_patterns)) assert ( From 0f839037ee0e660b4949be71abccdf6458486f27 Mon Sep 17 00:00:00 2001 From: Daksh Date: Wed, 9 Dec 2020 20:15:16 +0100 Subject: [PATCH 42/45] review comments --- rasa/core/policies/policy.py | 3 ++- rasa/core/policies/ted_policy.py | 21 ++++++++++++------- .../sparse_featurizer/regex_featurizer.py | 6 +++--- 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/rasa/core/policies/policy.py b/rasa/core/policies/policy.py index f0b7423b2e88..bfe3a37af921 100644 --- a/rasa/core/policies/policy.py +++ b/rasa/core/policies/policy.py @@ -113,12 +113,13 @@ def __init__( self, featurizer: Optional[TrackerFeaturizer] = None, priority: int = DEFAULT_POLICY_PRIORITY, + should_finetune: bool = False, **kwargs: Any, ) -> None: """Constructs a new Policy object.""" self.__featurizer = self._create_featurizer(featurizer) self.priority = priority - self.finetune_mode = kwargs.pop("should_finetune", False) + self.finetune_mode = should_finetune @property def featurizer(self): diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py index e786f8c23c3c..eaa725ef32ff 100644 --- a/rasa/core/policies/ted_policy.py +++ b/rasa/core/policies/ted_policy.py @@ -217,13 +217,16 @@ def __init__( max_history: Optional[int] = None, model: Optional[RasaModel] = None, zero_state_features: Optional[Dict[Text, List["Features"]]] = None, + should_finetune: bool = False, **kwargs: Any, ) -> None: """Declare instance variables with default values.""" if not featurizer: featurizer = self._standard_featurizer(max_history) - super().__init__(featurizer, priority, **kwargs) + super().__init__( + featurizer, priority, should_finetune=should_finetune, **kwargs + ) if isinstance(featurizer, FullDialogueTrackerFeaturizer): self.is_full_dialogue_featurizer_used = True else: @@ -441,7 +444,11 @@ def persist(self, path: Union[Text, Path]) -> None: @classmethod def load( - cls, path: Union[Text, Path], should_finetune: bool = False, 
**kwargs: Any + cls, + path: Union[Text, Path], + should_finetune: bool = False, + epoch_override: int = defaults[EPOCHS], + **kwargs: Any, ) -> "TEDPolicy": """Loads a policy from the storage. @@ -482,9 +489,7 @@ def load( ) meta = train_utils.update_similarity_type(meta) - meta["should_finetune"] = should_finetune - if "epoch_override" in kwargs: - meta[EPOCHS] = kwargs["epoch_override"] + meta[EPOCHS] = epoch_override model = TED.load( str(tf_model_file), @@ -500,14 +505,15 @@ def load( if not should_finetune: # build the graph for prediction + + features_to_select = STATE_LEVEL_FEATURES + FEATURES_TO_ENCODE + [DIALOGUE] predict_data_example = RasaModelData( label_key=LABEL_KEY, label_sub_key=LABEL_SUB_KEY, data={ feature_name: features for feature_name, features in model_data_example.items() - if feature_name - in STATE_LEVEL_FEATURES + FEATURES_TO_ENCODE + [DIALOGUE] + if feature_name in features_to_select }, ) model.build_for_predict(predict_data_example) @@ -517,6 +523,7 @@ def load( priority=priority, model=model, zero_state_features=zero_state_features, + should_finetune=should_finetune, **meta, ) diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index 8563caf69283..2e360fda38c0 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -306,10 +306,10 @@ def load( """ file_name = meta.get("file") - patterns_file_name = Path(model_dir) / file_name + ".patterns.pkl" + patterns_file_name = Path(model_dir) / (file_name + ".patterns.pkl") - vocabulary_stats_file_name = ( - Path(model_dir) / file_name + ".vocabulary_stats.pkl" + vocabulary_stats_file_name = Path(model_dir) / ( + file_name + ".vocabulary_stats.pkl" ) known_patterns = None From 4c2d24911accc83d19ac68bb91e880199f3b6255 Mon Sep 17 00:00:00 2001 From: Daksh Date: Wed, 9 Dec 2020 21:33:27 +0100 Subject: [PATCH 43/45] fix tests --- .../test_count_vectors_featurizer.py | 27 +++++-------------- .../nlu/featurizers/test_regex_featurizer.py | 6 ++--- tests/test_train.py | 2 +- 3 files changed, 10 insertions(+), 25 deletions(-) diff --git a/tests/nlu/featurizers/test_count_vectors_featurizer.py b/tests/nlu/featurizers/test_count_vectors_featurizer.py index edf68d1ccdac..17fc80224860 100644 --- a/tests/nlu/featurizers/test_count_vectors_featurizer.py +++ b/tests/nlu/featurizers/test_count_vectors_featurizer.py @@ -837,22 +837,10 @@ def test_cvf_incremental_train_vocabulary( assert new_vocabulary.get(vocab_token) == vocab_index -@pytest.mark.parametrize( - "additional_size, original_train_text, additional_train_text, overflow", - [ - (10, "hello my name is John.", "I am also new.", False), - (None, "hello my name is John.", "I am also new.", False), - (3, "hello my name is John.", "I am also new.", True), - ], -) -def test_cvf_incremental_train_vocabulary_overflow( - additional_size: Optional[int], - original_train_text: Text, - additional_train_text: Text, - overflow: bool, - tmp_path: Path, - caplog: LogCaptureFixture, -): +def test_cvf_incremental_train_vocabulary_overflow(tmp_path: Path,): + additional_size = 3 + original_train_text = "hello my name is John." + additional_train_text = "I am also new." 
tokenizer = WhitespaceTokenizer() original_featurizer = CountVectorsFeaturizer( {"additional_vocabulary_size": {"text": additional_size}}, finetune_mode=False, @@ -876,9 +864,6 @@ def test_cvf_incremental_train_vocabulary_overflow( data = TrainingData([train_message, additional_train_message]) tokenizer.train(data) - with caplog.at_level(logging.WARNING): + with pytest.warns(UserWarning) as warning: new_featurizer.train(data) - if overflow: - assert "New data contains vocabulary of size" in caplog.text - else: - assert "New data contains vocabulary of size" not in caplog.text + assert "New data contains vocabulary of size" in warning[0].message.args[0] diff --git a/tests/nlu/featurizers/test_regex_featurizer.py b/tests/nlu/featurizers/test_regex_featurizer.py index 03c69229b167..f7f2711de1b4 100644 --- a/tests/nlu/featurizers/test_regex_featurizer.py +++ b/tests/nlu/featurizers/test_regex_featurizer.py @@ -517,7 +517,7 @@ def test_incremental_train_featurization(tmp_path: Path): assert pattern_to_check == [new_patterns[1]] -def test_vocabulary_overflow_log(caplog: LogCaptureFixture): +def test_vocabulary_overflow_log(): patterns = [ {"pattern": "[0-9]+", "name": "number", "usage": "intent"}, {"pattern": "\\bhey*", "name": "hello", "usage": "intent"}, @@ -536,9 +536,9 @@ def test_vocabulary_overflow_log(caplog: LogCaptureFixture): {"pattern": "\\bhello+", "name": "greet", "usage": "intent"}, ] - with caplog.at_level(logging.WARNING): + with pytest.warns(UserWarning) as warning: featurizer.train(TrainingData([], regex_features=additional_patterns)) assert ( "The originally trained model was configured to handle a maximum number of 4 patterns" - in caplog.text + in warning[0].message.args[0] ) diff --git a/tests/test_train.py b/tests/test_train.py index 340be1b088cd..bb8ee36964a8 100644 --- a/tests/test_train.py +++ b/tests/test_train.py @@ -447,7 +447,7 @@ def test_model_finetuning_core( ted = model_to_finetune.policy_ensemble.policies[0] assert ted.config[EPOCHS] == 2 - assert ted.config["should_finetune"] is True + assert ted.finetune_mode def test_model_finetuning_core_with_default_epochs( From 6e0c9b97d997b49dd96a2bf892b7edec37368349 Mon Sep 17 00:00:00 2001 From: Joseph Juzl Date: Thu, 10 Dec 2020 16:55:21 +0100 Subject: [PATCH 44/45] Use all training labels for fingerprinting --- rasa/model.py | 2 +- .../shared/nlu/training_data/training_data.py | 18 +++- rasa/train.py | 11 ++- tests/conftest.py | 33 ++++++- .../nlu/training_data/test_training_data.py | 25 +++++ tests/test_train.py | 98 +++++++++++++++++-- 6 files changed, 169 insertions(+), 18 deletions(-) diff --git a/rasa/model.py b/rasa/model.py index 9a5a8eb4c517..d312f1dd6544 100644 --- a/rasa/model.py +++ b/rasa/model.py @@ -370,7 +370,7 @@ def _get_fingerprint_of_config_without_epochs( copied_config = copy.deepcopy(config) for key in ["pipeline", "policies"]: - if key in copied_config and copied_config[key]: + if copied_config.get(key): for p in copied_config[key]: if "epochs" in p: del p["epochs"] diff --git a/rasa/shared/nlu/training_data/training_data.py b/rasa/shared/nlu/training_data/training_data.py index 4206fcd21847..e40c9823c17b 100644 --- a/rasa/shared/nlu/training_data/training_data.py +++ b/rasa/shared/nlu/training_data/training_data.py @@ -80,12 +80,19 @@ def fingerprint(self) -> Text: return rasa.shared.utils.io.deep_container_fingerprint(relevant_attributes) def label_fingerprint(self) -> Text: - """Fingerprint the labels in the training data. + """Fingerprints the labels in the training data. 
Returns: hex string as a fingerprint of the training data labels. """ - return rasa.shared.utils.io.deep_container_fingerprint(self.intents) + labels = { + "intents": sorted(self.intents), + "entities": sorted(self.entities), + "entity_groups": sorted(self.entity_groups), + "entity_roles": sorted(self.entity_roles), + "actions": sorted(self.actions), + } + return rasa.shared.utils.io.deep_container_fingerprint(labels) def merge(self, *others: Optional["TrainingData"]) -> "TrainingData": """Return merged instance of this data with other training data. @@ -189,9 +196,14 @@ def intents(self) -> Set[Text]: """Returns the set of intents in the training data.""" return {ex.get(INTENT) for ex in self.training_examples} - {None} + @lazy_property + def actions(self) -> Set[Text]: + """Returns the set of actions in the training data.""" + return {ex.get(ACTION_NAME) for ex in self.training_examples} - {None} + @lazy_property def retrieval_intents(self) -> Set[Text]: - """Returns the total number of response types in the training data""" + """Returns the total number of response types in the training data.""" return { ex.get(INTENT) for ex in self.training_examples diff --git a/rasa/train.py b/rasa/train.py index ab891358671d..4f4a0cf597c0 100644 --- a/rasa/train.py +++ b/rasa/train.py @@ -505,7 +505,7 @@ async def _core_model_for_finetuning( old_fingerprint = model.fingerprint_from_path(unpacked) if not model.can_finetune(old_fingerprint, new_fingerprint, core=True): rasa.shared.utils.cli.print_error_and_exit( - "NLU model can not be finetuned." + "Core model can not be finetuned." ) config = await file_importer.get_config() @@ -651,7 +651,7 @@ async def _train_nlu_with_validated_data( model_to_finetune, file_importer, finetuning_epoch_fraction, - also_train_core=train_path is not None, + called_from_combined_training=train_path is not None, ) if not model_to_finetune: rasa.shared.utils.cli.print_error_and_exit( @@ -697,7 +697,7 @@ async def _nlu_model_for_finetuning( model_to_finetune: Text, file_importer: TrainingDataImporter, finetuning_epoch_fraction: float = 1.0, - also_train_core: bool = False, + called_from_combined_training: bool = False, ) -> Optional[Interpreter]: path_to_archive = model.get_model_for_finetuning(model_to_finetune) @@ -709,7 +709,10 @@ async def _nlu_model_for_finetuning( new_fingerprint = await model.model_fingerprint(file_importer) old_fingerprint = model.fingerprint_from_path(unpacked) if not model.can_finetune( - old_fingerprint, new_fingerprint, nlu=True, core=also_train_core + old_fingerprint, + new_fingerprint, + nlu=True, + core=called_from_combined_training, ): rasa.shared.utils.cli.print_error_and_exit( "NLU model can not be finetuned." 
diff --git a/tests/conftest.py b/tests/conftest.py index 06eee24bad61..3005a2710c79 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -31,7 +31,7 @@ import rasa.core.run from rasa.core.tracker_store import InMemoryTrackerStore, TrackerStore from rasa.model import get_model -from rasa.train import train_async +from rasa.train import train_async, _train_nlu_async from rasa.utils.common import TempDirectoryPath from tests.core.conftest import ( DEFAULT_DOMAIN_PATH_WITH_SLOTS, @@ -100,6 +100,15 @@ async def trained_moodbot_path(trained_async: Callable) -> Text: ) +@pytest.fixture(scope="session") +async def trained_nlu_moodbot_path(trained_nlu_async: Callable) -> Text: + return await trained_nlu_async( + domain="examples/moodbot/domain.yml", + config="examples/moodbot/config.yml", + nlu_data="examples/moodbot/data/nlu.yml", + ) + + @pytest.fixture(scope="session") async def unpacked_trained_moodbot_path( trained_moodbot_path: Text, @@ -163,8 +172,13 @@ def end_to_end_story_file() -> Text: @pytest.fixture(scope="session") -def default_config() -> List[Policy]: - return config.load(DEFAULT_CONFIG_PATH) +def default_config_path() -> Text: + return DEFAULT_CONFIG_PATH + + +@pytest.fixture(scope="session") +def default_config(default_config_path) -> List[Policy]: + return config.load(default_config_path) @pytest.fixture(scope="session") @@ -180,6 +194,19 @@ async def _train( return _train +@pytest.fixture(scope="session") +def trained_nlu_async(tmpdir_factory: TempdirFactory) -> Callable: + async def _train_nlu( + *args: Any, output_path: Optional[Text] = None, **kwargs: Any + ) -> Optional[Text]: + if output_path is None: + output_path = str(tmpdir_factory.mktemp("models")) + + return await _train_nlu_async(*args, output=output_path, **kwargs) + + return _train_nlu + + @pytest.fixture(scope="session") async def trained_rasa_model( trained_async: Callable, diff --git a/tests/shared/nlu/training_data/test_training_data.py b/tests/shared/nlu/training_data/test_training_data.py index b828a37c7b84..a0acba1460d1 100644 --- a/tests/shared/nlu/training_data/test_training_data.py +++ b/tests/shared/nlu/training_data/test_training_data.py @@ -12,10 +12,13 @@ ENTITY_ATTRIBUTE_VALUE, ENTITY_ATTRIBUTE_TYPE, ENTITIES, + INTENT, + ACTION_NAME, ) from rasa.nlu.convert import convert_training_data from rasa.nlu.extractors.mitie_entity_extractor import MitieEntityExtractor from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer +from rasa.shared.nlu.training_data.message import Message from rasa.shared.nlu.training_data.training_data import TrainingData from rasa.shared.nlu.training_data.loading import guess_format, UNK, load_data from rasa.shared.nlu.training_data.util import ( @@ -633,3 +636,25 @@ def test_fingerprint_is_same_when_loading_data_again(): td1 = training_data_from_paths(files, language="en") td2 = training_data_from_paths(files, language="en") assert td1.fingerprint() == td2.fingerprint() + + +@pytest.mark.parametrize( + "message", + [ + Message({INTENT: "intent2"}), + Message({ENTITIES: [{"entity": "entity2"}]}), + Message({ENTITIES: [{"entity": "entity1", "group": "new_group"}]}), + Message({ENTITIES: [{"entity": "entity1", "role": "new_role"}]}), + Message({ACTION_NAME: "action_name2"}), + ], +) +def test_label_fingerprints(message: Message): + training_data1 = TrainingData( + [ + Message({INTENT: "intent1"}), + Message({ENTITIES: [{"entity": "entity1"}]}), + Message({ACTION_NAME: "action_name1"}), + ] + ) + training_data2 = training_data1.merge(TrainingData([message])) + 
assert training_data1.label_fingerprint() != training_data2.label_fingerprint() diff --git a/tests/test_train.py b/tests/test_train.py index a4ea7b2fee51..3ed1ae2de121 100644 --- a/tests/test_train.py +++ b/tests/test_train.py @@ -492,6 +492,7 @@ def test_model_finetuning_core_new_domain_label( (tmp_path / "models").mkdir() output = str(tmp_path / "models") + # Simulate addition to training data old_domain = rasa.shared.utils.io.read_yaml_file("examples/moodbot/domain.yml") old_domain["intents"].append("a_new_one") new_domain_path = tmp_path / "new_domain.yml" @@ -545,7 +546,7 @@ def test_model_finetuning_new_domain_label_stops_all_training( def test_model_finetuning_nlu( tmp_path: Path, monkeypatch: MonkeyPatch, - trained_moodbot_path: Text, + trained_nlu_moodbot_path: Text, use_latest_model: bool, ): mocked_nlu_training = AsyncMock(return_value="") @@ -561,7 +562,7 @@ def test_model_finetuning_nlu( output = str(tmp_path / "models") if use_latest_model: - trained_moodbot_path = str(Path(trained_moodbot_path).parent) + trained_nlu_moodbot_path = str(Path(trained_nlu_moodbot_path).parent) # Typically models will be fine-tuned with a smaller number of epochs than training # from scratch. @@ -581,7 +582,7 @@ def test_model_finetuning_nlu( str(new_nlu_path), domain="examples/moodbot/domain.yml", output=output, - model_to_finetune=trained_moodbot_path, + model_to_finetune=trained_nlu_moodbot_path, finetuning_epoch_fraction=0.5, ) @@ -601,7 +602,7 @@ def test_model_finetuning_nlu( def test_model_finetuning_nlu_new_label( - tmp_path: Path, monkeypatch: MonkeyPatch, trained_moodbot_path: Text, + tmp_path: Path, monkeypatch: MonkeyPatch, trained_nlu_moodbot_path: Text, ): mocked_nlu_training = AsyncMock(return_value="") monkeypatch.setattr(rasa.nlu, rasa.nlu.train.__name__, mocked_nlu_training) @@ -620,14 +621,97 @@ def test_model_finetuning_nlu_new_label( str(new_nlu_path), domain="examples/moodbot/domain.yml", output=output, - model_to_finetune=trained_moodbot_path, + model_to_finetune=trained_nlu_moodbot_path, ) mocked_nlu_training.assert_not_called() +def test_model_finetuning_nlu_new_entity( + tmp_path: Path, monkeypatch: MonkeyPatch, trained_nlu_moodbot_path: Text, +): + mocked_nlu_training = AsyncMock(return_value="") + monkeypatch.setattr(rasa.nlu, rasa.nlu.train.__name__, mocked_nlu_training) + + (tmp_path / "models").mkdir() + output = str(tmp_path / "models") + + old_nlu = rasa.shared.utils.io.read_yaml_file("examples/moodbot/data/nlu.yml") + old_nlu["nlu"][-1]["examples"] = "-[blah](something)" + new_nlu_path = tmp_path / "new_nlu.yml" + rasa.shared.utils.io.write_yaml(old_nlu, new_nlu_path) + + with pytest.raises(SystemExit): + train_nlu( + "examples/moodbot/config.yml", + str(new_nlu_path), + domain="examples/moodbot/domain.yml", + output=output, + model_to_finetune=trained_nlu_moodbot_path, + ) + + mocked_nlu_training.assert_not_called() + + +def test_model_finetuning_nlu_new_label_already_in_domain( + tmp_path: Path, + monkeypatch: MonkeyPatch, + trained_rasa_model: Text, + default_nlu_data: Text, + default_config_path: Text, + default_domain_path: Text, +): + mocked_nlu_training = AsyncMock(return_value="") + monkeypatch.setattr(rasa.nlu, rasa.nlu.train.__name__, mocked_nlu_training) + + (tmp_path / "models").mkdir() + output = str(tmp_path / "models") + + old_nlu = rasa.shared.utils.io.read_yaml_file(default_nlu_data) + # This intent exists in `default_domain_path` but not yet in the nlu data + old_nlu["nlu"].append({"intent": "why", "examples": "whyy??"}) + new_nlu_path = 
tmp_path / "new_nlu.yml" + rasa.shared.utils.io.write_yaml(old_nlu, new_nlu_path) + + with pytest.raises(SystemExit): + train_nlu( + default_config_path, + str(new_nlu_path), + domain=default_domain_path, + output=output, + model_to_finetune=trained_rasa_model, + ) + + mocked_nlu_training.assert_not_called() + + +def test_model_finetuning_nlu_new_label_to_domain_only( + tmp_path: Path, monkeypatch: MonkeyPatch, trained_nlu_moodbot_path: Text, +): + mocked_nlu_training = AsyncMock(return_value="") + monkeypatch.setattr(rasa.nlu, rasa.nlu.train.__name__, mocked_nlu_training) + + (tmp_path / "models").mkdir() + output = str(tmp_path / "models") + + old_domain = rasa.shared.utils.io.read_yaml_file("examples/moodbot/domain.yml") + old_domain["intents"].append("a_new_one") + new_domain_path = tmp_path / "new_domain.yml" + rasa.shared.utils.io.write_yaml(old_domain, new_domain_path) + + train_nlu( + "examples/moodbot/config.yml", + "examples/moodbot/data/nlu.yml", + domain=str(new_domain_path), + output=output, + model_to_finetune=trained_nlu_moodbot_path, + ) + + mocked_nlu_training.assert_called() + + def test_model_finetuning_nlu_with_default_epochs( - tmp_path: Path, monkeypatch: MonkeyPatch, trained_moodbot_path: Text, + tmp_path: Path, monkeypatch: MonkeyPatch, trained_nlu_moodbot_path: Text, ): mocked_nlu_training = AsyncMock(return_value="") monkeypatch.setattr(rasa.nlu, rasa.nlu.train.__name__, mocked_nlu_training) @@ -646,7 +730,7 @@ def test_model_finetuning_nlu_with_default_epochs( str(new_config_path), "examples/moodbot/data/nlu.yml", output=output, - model_to_finetune=trained_moodbot_path, + model_to_finetune=trained_nlu_moodbot_path, finetuning_epoch_fraction=0.5, ) From dd7135bff555e1be6eb037161d530c5aa74dbce2 Mon Sep 17 00:00:00 2001 From: Joseph Juzl Date: Fri, 11 Dec 2020 09:51:00 +0100 Subject: [PATCH 45/45] rename to action_names --- rasa/shared/nlu/training_data/training_data.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rasa/shared/nlu/training_data/training_data.py b/rasa/shared/nlu/training_data/training_data.py index e40c9823c17b..55352a410924 100644 --- a/rasa/shared/nlu/training_data/training_data.py +++ b/rasa/shared/nlu/training_data/training_data.py @@ -90,7 +90,7 @@ def label_fingerprint(self) -> Text: "entities": sorted(self.entities), "entity_groups": sorted(self.entity_groups), "entity_roles": sorted(self.entity_roles), - "actions": sorted(self.actions), + "actions": sorted(self.action_names), } return rasa.shared.utils.io.deep_container_fingerprint(labels) @@ -197,8 +197,8 @@ def intents(self) -> Set[Text]: return {ex.get(INTENT) for ex in self.training_examples} - {None} @lazy_property - def actions(self) -> Set[Text]: - """Returns the set of actions in the training data.""" + def action_names(self) -> Set[Text]: + """Returns the set of action names in the training data.""" return {ex.get(ACTION_NAME) for ex in self.training_examples} - {None} @lazy_property
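To close, the behaviour these label-fingerprint changes and tests pin down can be summarised in a short sketch. It is illustrative only (the intent names are examples): introducing any new label changes the label fingerprint, which is what makes can_finetune reject the old model for finetuning.

from rasa.shared.nlu.training_data.message import Message
from rasa.shared.nlu.training_data.training_data import TrainingData

old_data = TrainingData([Message({"intent": "greet"})])
new_data = old_data.merge(TrainingData([Message({"intent": "a_new_one"})]))

# A previously unseen intent label yields a different label fingerprint,
# so finetuning on the new data is refused and a full retrain is required.
assert old_data.label_fingerprint() != new_data.label_fingerprint()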