Merge remote-tracking branch 'origin/master' into continuous_training
joejuzl committed Dec 14, 2020
2 parents 5ee6a58 + 3c5102e commit 977de59
Showing 43 changed files with 669 additions and 150 deletions.
7 changes: 7 additions & 0 deletions .github/configs/mr-test-example.yaml
@@ -26,6 +26,13 @@
## config: ["all"]
## - dataset: ["Hermit"]
## config: ["Sparse + DIET(seq) + ResponseSelector(t2t)", "BERT + DIET(seq) + ResponseSelector(t2t)"]
#
## Example: Define a branch name to check out for a dataset repository. The default branch is 'master'.
## dataset_branch: "test-branch"
## include:
## - dataset: ["Carbon Bot", "Sara"]
## config: ["all"]


include:
- dataset: ["Carbon Bot"]
2 changes: 2 additions & 0 deletions .github/scripts/mr_generate_summary.py
@@ -6,6 +6,7 @@
SUMMARY_FILE = os.environ["SUMMARY_FILE"]
CONFIG = os.environ["CONFIG"]
DATASET = os.environ["DATASET_NAME"]
DATASET_REPOSITORY_BRANCH = os.environ["DATASET_REPOSITORY_BRANCH"]
task_mapping = {
"intent_report.json": "intent_classification",
"CRFEntityExtractor_report.json": "entity_prediction",
@@ -21,6 +22,7 @@ def generate_json(file, task, data):
data[DATASET] = {CONFIG: {}, **data[DATASET]}

data[DATASET][CONFIG] = {
"dataset_repository_branch": DATASET_REPOSITORY_BRANCH,
"accelerator_type": os.environ["ACCELERATOR_TYPE"],
"test_run_time": os.environ["TEST_RUN_TIME"],
"train_run_time": os.environ["TRAIN_RUN_TIME"],
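Taken together, the script writes one entry per configuration under each dataset, with the new branch field alongside the existing run metadata. A minimal sketch of the resulting structure, with hypothetical values standing in for the workflow-provided environment variables:

```python
# Hypothetical values; in CI these come from the workflow environment.
env = {
    "DATASET_NAME": "Carbon Bot",
    "CONFIG": "Sparse + DIET(seq) + ResponseSelector(t2t)",
    "DATASET_REPOSITORY_BRANCH": "test-branch",
    "ACCELERATOR_TYPE": "GPU",
    "TEST_RUN_TIME": "2m 30s",
    "TRAIN_RUN_TIME": "12m 4s",
}

data = {}
# Mirrors generate_json(): one dict per dataset, keyed by configuration,
# now carrying the dataset repository branch with the run metadata.
data.setdefault(env["DATASET_NAME"], {})[env["CONFIG"]] = {
    "dataset_repository_branch": env["DATASET_REPOSITORY_BRANCH"],
    "accelerator_type": env["ACCELERATOR_TYPE"],
    "test_run_time": env["TEST_RUN_TIME"],
    "train_run_time": env["TRAIN_RUN_TIME"],
}
print(data)
```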
1 change: 1 addition & 0 deletions .github/scripts/mr_publish_results.py
@@ -27,6 +27,7 @@ def send_to_segment(context):
"results",
{
"dataset": os.environ["DATASET_NAME"],
"dataset_repository_branch": os.environ["DATASET_REPOSITORY_BRANCH"],
"workflow": os.environ["GITHUB_WORKFLOW"],
"config": os.environ["CONFIG"],
"pr_url": os.environ["PR_URL"],
9 changes: 7 additions & 2 deletions .github/templates/model_regression_test_config_to_json.tmpl
@@ -4,6 +4,11 @@ The template reads an issue/a PR comment and transforms a YAML code block into JSON

*/ -}}
{{- $config := ((datasource "github").body | regexp.Find "```(?s)(.*)```" | regexp.ReplaceLiteral "```.*|\r" "" | yaml | toJSON | json) -}}
{{- $dataset_branch := "master" -}}
{{- /* if a branch name for the dataset repository is not defined, use the master branch */ -}}
{{- if has $config "dataset_branch" -}}
{{- $dataset_branch = $config.dataset_branch -}}
{{- end -}}
{"include":[
{{- $inc := coll.Slice -}}
{{- $dataset := coll.Slice -}}
@@ -19,10 +24,10 @@ The template reads an issue/a PR comment and transforms a YAML code block into JSON
{{- /* use all available configurations if value is equal to all */ -}}
{{- if eq $value_config "all" -}}
{{- range $config_name, $config_file := (datasource "mapping").configurations -}}
{{ $inc = (coll.Append (dict "dataset" $value_dataset "config" $config_name | toJSON) $inc) -}}
{{ $inc = (coll.Append (dict "dataset_branch" $dataset_branch "dataset" $value_dataset "config" $config_name | toJSON) $inc) -}}
{{- end -}}
{{- else -}}
{{ $inc = (coll.Append (dict "dataset" $value_dataset "config" $value_config | toJSON) $inc) -}}
{{ $inc = (coll.Append (dict "dataset_branch" $dataset_branch "dataset" $value_dataset "config" $value_config | toJSON) $inc) -}}
{{- end -}}
{{- end -}}
{{- end -}}
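The template's branch handling reduces to a default-and-expand rule. A Python sketch of the equivalent logic (the function name and sample values are illustrative, not part of the template):

```python
import itertools

def build_matrix_includes(comment_config: dict, known_configs: list) -> list:
    """Sketch of the gomplate logic: default `dataset_branch` to 'master',
    expand `config: ["all"]` to every known configuration, and attach the
    branch to each generated matrix entry."""
    branch = comment_config.get("dataset_branch", "master")
    includes = []
    for entry in comment_config.get("include", []):
        configs = entry["config"]
        if configs == ["all"]:
            configs = known_configs
        for dataset, config in itertools.product(entry["dataset"], configs):
            includes.append(
                {"dataset_branch": branch, "dataset": dataset, "config": config}
            )
    return includes

# Hypothetical comment body, matching the example in mr-test-example.yaml.
print(build_matrix_includes(
    {"dataset_branch": "test-branch",
     "include": [{"dataset": ["Carbon Bot", "Sara"], "config": ["all"]}]},
    known_configs=["Sparse + DIET(seq) + ResponseSelector(t2t)"],
))
```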
2 changes: 1 addition & 1 deletion .github/templates/model_regression_test_results.tmpl
@@ -37,7 +37,7 @@ Render Markdown with results.
{{- $results_master := (datasource "results_master") -}}
{{ range $dataset, $config := (datasource "data")}}
{{- $dataset_master := (index $results_master $dataset) -}}
Dataset: `{{$dataset}}`
Dataset: `{{$dataset}}`, Dataset repository branch: `{{ (index $config (index (keys $config) 0)).dataset_repository_branch }}`

| Configuration | Intent Classification Micro F1 | Entity Recognition Micro F1 | Response Selection Micro F1 |
|---------------|-----------------|-----------------|-------------------|
1 change: 1 addition & 0 deletions .github/workflows/ci-model-regression-on-schedule.yml
@@ -206,6 +206,7 @@ jobs:
TRAIN_RUN_TIME: ${{ steps.run_test.outputs.train_run_time }}
TOTAL_RUN_TIME: ${{ steps.run_test.outputs.total_run_time }}
PR_URL: ""
DATASET_REPOSITORY_BRANCH: "master"
run: |-
poetry run pip install analytics-python
poetry run python .github/scripts/mr_publish_results.py
4 changes: 4 additions & 0 deletions .github/workflows/ci-model-regression.yml
@@ -195,6 +195,7 @@ jobs:
repository: ${{ secrets.DATASET_REPOSITORY }}
token: ${{ secrets.ML_TEST_SA_PAT }}
path: 'dataset'
ref: ${{ matrix.dataset_branch }}

- name: Set DATASET and CONFIG variables
id: set_dataset_config_vars
@@ -298,6 +299,7 @@ jobs:
TEST_RUN_TIME: ${{ steps.run_test.outputs.test_run_time }}
TRAIN_RUN_TIME: ${{ steps.run_test.outputs.train_run_time }}
TOTAL_RUN_TIME: ${{ steps.run_test.outputs.total_run_time }}
DATASET_REPOSITORY_BRANCH: ${{ matrix.dataset_branch }}
run: |-
export PR_URL="https://github.com/${GITHUB_REPOSITORY}/pull/${{ github.event.number }}"
poetry run pip install analytics-python
@@ -335,6 +337,7 @@ jobs:
repository: ${{ secrets.DATASET_REPOSITORY }}
token: ${{ secrets.ML_TEST_SA_PAT }}
path: 'dataset'
ref: ${{ matrix.dataset_branch }}

- name: Download gomplate
run: |-
@@ -438,6 +441,7 @@ jobs:
TEST_RUN_TIME: ${{ steps.run_test.outputs.test_run_time }}
TRAIN_RUN_TIME: ${{ steps.run_test.outputs.train_run_time }}
TOTAL_RUN_TIME: ${{ steps.run_test.outputs.total_run_time }}
DATASET_REPOSITORY_BRANCH: ${{ matrix.dataset_branch }}
run: |-
export PR_URL="https://github.com/${GITHUB_REPOSITORY}/pull/${{ github.event.number }}"
poetry run pip install analytics-python
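Each job then receives one entry of the generated matrix; the fields referenced above as `${{ matrix.dataset_branch }}`, `${{ matrix.dataset }}`, and `${{ matrix.config }}` correspond to keys like these (values are illustrative):

```python
# One generated matrix entry, as consumed by the workflow above.
matrix_entry = {
    "dataset_branch": "test-branch",  # feeds actions/checkout's `ref` input
    "dataset": "Carbon Bot",
    "config": "Sparse + DIET(seq) + ResponseSelector(t2t)",
}
```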
35 changes: 35 additions & 0 deletions CHANGELOG.mdx
@@ -16,6 +16,41 @@ https://github.com/RasaHQ/rasa/tree/master/changelog/ . -->

<!-- TOWNCRIER -->

## [2.1.3] - 2020-12-04


### Improvements
- [#7426](https://github.com/rasahq/rasa/issues/7426): Removed `multidict` from the project dependencies. `multidict` remains a second-order
dependency of Rasa Open Source, but its version is now determined by the dependencies
that use it rather than by Rasa Open Source directly.

This resolves issues like the following:

```bash
sanic 20.9.1 has requirement multidict==5.0.0, but you'll have multidict 4.6.0 which is incompatible.
```

### Bugfixes
- [#7316](https://github.com/rasahq/rasa/issues/7316): `SingleStateFeaturizer` checks whether it was trained with `RegexInterpreter` as
the NLU interpreter. If so, `RegexInterpreter` is also used during prediction.
- [#7390](https://github.com/rasahq/rasa/issues/7390): Make sure the `responses` are synced between NLU training data and the Domain even if there are no retrieval intents in the NLU training data.
- [#7417](https://github.com/rasahq/rasa/issues/7417): Categorical slots now get a default value set when only updating NLG data in the domain.

Previously this resulted in an `InvalidDomain` exception being thrown.
- [#7418](https://github.com/rasahq/rasa/issues/7418): - Preserve `domain` slot ordering while dumping it back to the file.
- Preserve multiline `text` examples of `responses` defined in `domain` and `NLU` training data.


## [2.1.2] - 2020-11-27


### Bugfixes
- [#7235](https://github.com/rasahq/rasa/issues/7235): Slots that use `initial_value` no longer cause rule contradiction errors when `conversation_start: true` is used. Previously, two rules that differed only in their use of `conversation_start` were flagged as contradicting when a slot used `initial_value`.

When checking for incomplete rules, an action is now required to have set _only_ those slots that the same action sets in another rule. Previously, an action was also expected to have set slots which appeared after that action in another rule but which it did not actually set.
- [#7345](https://github.com/rasahq/rasa/issues/7345): Fixed Rasa Open Source not being able to fetch models from certain URLs.


## [2.1.1] - 2020-11-23


1 change: 1 addition & 0 deletions changelog/6804.bugfix.md
@@ -0,0 +1 @@
Rename `language_list` to `supported_language_list` for `JiebaTokenizer`.
1 change: 1 addition & 0 deletions changelog/7407.bugfix.md
@@ -0,0 +1 @@
Remove a token when its text (for example, whitespace) can't be tokenized by the language model tokenizer (in `LanguageModelFeaturizer`).
15 changes: 15 additions & 0 deletions changelog/7495.removal.md
@@ -0,0 +1,15 @@
Deprecate training and test data in Markdown format. This includes:
- reading and writing of story files in Markdown format
- reading and writing of NLU data in Markdown format
- reading and writing of retrieval intent data in Markdown format

Support for Markdown data will be removed entirely in Rasa Open Source 3.0.0.

Please convert your existing Markdown data by using the commands
from the [migration guide](./migration-guide.mdx#rasa-21-to-rasa-22):

```bash
rasa data convert nlu -f yaml --data={SOURCE_DIR} --out={TARGET_DIR}
rasa data convert nlg -f yaml --data={SOURCE_DIR} --out={TARGET_DIR}
rasa data convert core -f yaml --data={SOURCE_DIR} --out={TARGET_DIR}
```
10 changes: 10 additions & 0 deletions data/test_domains/domain_with_categorical_slot.yml
@@ -0,0 +1,10 @@
slots:
category_slot:
type: categorical
values:
- value_one
- value_two

responses:
utter_greet:
- text: "hey there!"
3 changes: 3 additions & 0 deletions data/test_nlg/test_responses.yml
@@ -0,0 +1,3 @@
responses:
utter_rasa:
- text: this is utter_rasa!
14 changes: 13 additions & 1 deletion docs/docs/migration-guide.mdx
@@ -12,6 +12,18 @@ how you can migrate from one version to another.

## Rasa 2.1 to Rasa 2.2

### Deprecations

Training and test data in Markdown format is now deprecated. This includes:
- reading and writing of story files in Markdown format
- reading and writing of NLU data in Markdown format
- reading and writing of retrieval intent data in Markdown format

Support for Markdown data will be removed entirely in Rasa Open Source 3.0.0.

Please convert your existing Markdown data by using the commands
described [here](./migration-guide.mdx#training-data-files).

### Policies

[Policies](./policies.mdx) now require a `**kwargs` argument in their constructor and `load` method.
@@ -61,7 +73,7 @@ in the [forum](https://forum.rasa.com/t/rasa-open-source-2-0-is-out-now-internal
### Training data files

As of version 2.0, the new default training data format is YAML. Markdown is still supported,
but this will be deprecated in a future release.
but support for it will be removed in Rasa Open Source 3.0.0.

You can convert existing NLU, Stories, and NLG (i.e. `responses.md`) training data
files in the Markdown format to the new YAML format using the following commands:
4 changes: 1 addition & 3 deletions rasa/core/agent.py
@@ -355,9 +355,7 @@ def __init__(
self.policy_ensemble = self._create_ensemble(policies)

if self.domain is not None:
self.domain.add_requested_slot()
self.domain.add_knowledge_base_slots()
self.domain.add_categorical_slot_default_value()
self.domain.setup_slots()

PolicyEnsemble.check_domain_ensemble_compatibility(
self.policy_ensemble, self.domain
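The three slot-setup calls are folded into a single `Domain.setup_slots()` helper. Its body is not shown in this diff; a plausible sketch, assuming it simply chains the previous calls:

```python
class Domain:
    # ... existing Domain attributes and methods ...

    def setup_slots(self) -> None:
        """Add all implicitly defined slots to the domain.

        Sketch only: the diff replaces the three separate calls with this
        single entry point, so presumably it wraps the same logic.
        """
        self.add_requested_slot()
        self.add_knowledge_base_slots()
        self.add_categorical_slot_default_value()
```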
@@ -41,12 +41,18 @@ async def convert_and_write(cls, source_path: Path, output_path: Path) -> None:

# check if the source file is a test stories file
if MarkdownStoryReader.is_test_stories_file(source_path):
reader = MarkdownStoryReader(is_used_for_training=False, use_e2e=True)
reader = MarkdownStoryReader(
is_used_for_training=False,
use_e2e=True,
ignore_deprecation_warning=True,
)
output_core_path = cls._generate_path_for_converted_test_data_file(
source_path, output_path
)
else:
reader = MarkdownStoryReader(is_used_for_training=False)
reader = MarkdownStoryReader(
is_used_for_training=False, ignore_deprecation_warning=True
)
output_core_path = cls.generate_path_for_converted_training_data_file(
source_path, output_path
)
4 changes: 2 additions & 2 deletions rasa/model.py
@@ -320,7 +320,7 @@ async def model_fingerprint(file_importer: "TrainingDataImporter") -> Fingerprint
domain = copy.copy(domain)
# don't include the response texts in the fingerprint.
# Their fingerprint is separate.
domain.templates = []
domain.templates = {}

return {
FINGERPRINT_CONFIG_KEY: _get_fingerprint_of_config(
@@ -580,7 +580,7 @@ async def update_model_with_new_domain(
"""
model_path = Path(unpacked_model_path) / DEFAULT_CORE_SUBDIRECTORY_NAME
domain = await importer.get_domain()

domain.setup_slots()
domain.persist(model_path / DEFAULT_DOMAIN_PATH)


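The fingerprint fix matters because `domain.templates` is a mapping from response names to their variations, not a list, so clearing it with `[]` changed the attribute's type. A minimal illustration (the response name is taken from the test data added in this commit):

```python
# Responses/templates are keyed by response name:
templates = {"utter_greet": [{"text": "hey there!"}]}

# Excluding them from the fingerprint should preserve the mapping type:
templates = {}    # correct "empty" value
# templates = []  # previous value: a list, which breaks dict-style consumers
```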
6 changes: 6 additions & 0 deletions rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
@@ -347,6 +347,12 @@ def _tokenize_example(
# use lm specific tokenizer to further tokenize the text
split_token_ids, split_token_strings = self._lm_tokenize(token.text)

if not split_token_ids:
# handle the case where `token.text` contains only whitespace or other
# special characters, which leaves `split_token_ids` and `split_token_strings`
# empty and would make `self._lm_specific_token_cleanup()` raise an exception
continue

(split_token_ids, split_token_strings) = self._lm_specific_token_cleanup(
split_token_ids, split_token_strings
)
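The guard covers tokens whose text the language model tokenizer maps to nothing at all. A toy reproduction of the failure mode (the tokenizer here is a hypothetical stand-in, not the real `_lm_tokenize`):

```python
def lm_tokenize(text: str):
    # Stand-in: like many LM tokenizers, return nothing for
    # pure-whitespace or otherwise untokenizable input.
    pieces = text.split()
    return list(range(len(pieces))), pieces

for token_text in ["hello", " "]:
    split_token_ids, split_token_strings = lm_tokenize(token_text)
    if not split_token_ids:
        # Without this guard, the empty lists would reach the
        # model-specific cleanup step and raise there.
        continue
    print(token_text, split_token_ids, split_token_strings)
```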
3 changes: 2 additions & 1 deletion rasa/nlu/tokenizers/jieba_tokenizer.py
@@ -17,8 +17,9 @@


class JiebaTokenizer(Tokenizer):
"""This tokenizer is a wrapper for Jieba (https://github.com/fxsjy/jieba)."""

language_list = ["zh"]
supported_language_list = ["zh"]

defaults = {
"dictionary_path": None,
@@ -1,5 +1,5 @@
from pathlib import Path
from typing import Dict, Text
from typing import Text

from rasa.shared.constants import UTTER_PREFIX
from rasa.shared.nlu.training_data.formats import NLGMarkdownReader
@@ -31,7 +31,7 @@ async def convert_and_write(cls, source_path: Path, output_path: Path) -> None:
source_path: Path to the training data file.
output_path: Path to the output directory.
"""
reader = NLGMarkdownReader()
reader = NLGMarkdownReader(ignore_deprecation_warning=True)
writer = RasaYAMLWriter()

output_nlg_path = cls.generate_path_for_converted_training_data_file(
@@ -35,7 +35,9 @@ async def convert_and_write(cls, source_path: Path, output_path: Path) -> None:
source_path, output_path
)

yaml_training_data = MarkdownReader().read(source_path)
yaml_training_data = MarkdownReader(ignore_deprecation_warning=True).read(
source_path
)
RasaYAMLWriter().dump(output_nlu_path, yaml_training_data)

for lookup_table in yaml_training_data.lookup_tables:
3 changes: 3 additions & 0 deletions rasa/shared/constants.py
@@ -19,6 +19,9 @@
DOCS_URL_TRACKER_STORES = DOCS_BASE_URL + "/tracker-stores"
DOCS_URL_COMPONENTS = DOCS_BASE_URL + "/components"
DOCS_URL_MIGRATION_GUIDE = DOCS_BASE_URL + "/migration-guide"
DOCS_URL_MIGRATION_GUIDE_MD_DEPRECATION = (
f"{DOCS_URL_MIGRATION_GUIDE}#rasa-21-to-rasa-22"
)
DOCS_URL_TELEMETRY = DOCS_BASE_URL + "/telemetry/telemetry"
DOCS_BASE_URL_RASA_X = "https://rasa.com/docs/rasa-x"
