Merge remote-tracking branch 'origin/master' into continuous_training
joejuzl committed Dec 14, 2020
2 parents 5ee6a58 + 3c5102e commit 977de59
Showing 43 changed files with 669 additions and 150 deletions.
7 changes: 7 additions & 0 deletions .github/configs/mr-test-example.yaml
@@ -26,6 +26,13 @@
## config: ["all"]
## - dataset: ["Hermit"]
## config: ["Sparse + DIET(seq) + ResponseSelector(t2t)", "BERT + DIET(seq) + ResponseSelector(t2t)"]
#
## Example: Define a branch name to check out for a dataset repository. The default branch is 'master'.
## dataset_branch: "test-branch"
## include:
## - dataset: ["Carbon Bot", "Sara"]
## config: ["all"]


include:
- dataset: ["Carbon Bot"]
2 changes: 2 additions & 0 deletions .github/scripts/mr_generate_summary.py
@@ -6,6 +6,7 @@
SUMMARY_FILE = os.environ["SUMMARY_FILE"]
CONFIG = os.environ["CONFIG"]
DATASET = os.environ["DATASET_NAME"]
DATASET_REPOSITORY_BRANCH = os.environ["DATASET_REPOSITORY_BRANCH"]
task_mapping = {
"intent_report.json": "intent_classification",
"CRFEntityExtractor_report.json": "entity_prediction",
@@ -21,6 +22,7 @@ def generate_json(file, task, data):
data[DATASET] = {CONFIG: {}, **data[DATASET]}

data[DATASET][CONFIG] = {
"dataset_repository_branch": DATASET_REPOSITORY_BRANCH,
"accelerator_type": os.environ["ACCELERATOR_TYPE"],
"test_run_time": os.environ["TEST_RUN_TIME"],
"train_run_time": os.environ["TRAIN_RUN_TIME"],
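Taken together, the script writes one entry per configuration under each dataset, with the new branch field alongside the existing run metadata. A minimal sketch of the resulting structure, with hypothetical values standing in for the workflow-provided environment variables:

```python
# Hypothetical values; in CI these come from the workflow environment.
env = {
    "DATASET_NAME": "Carbon Bot",
    "CONFIG": "Sparse + DIET(seq) + ResponseSelector(t2t)",
    "DATASET_REPOSITORY_BRANCH": "test-branch",
    "ACCELERATOR_TYPE": "GPU",
    "TEST_RUN_TIME": "2m 30s",
    "TRAIN_RUN_TIME": "12m 4s",
}

data = {}
# Mirrors generate_json(): one dict per dataset, keyed by configuration,
# now carrying the dataset repository branch with the run metadata.
data.setdefault(env["DATASET_NAME"], {})[env["CONFIG"]] = {
    "dataset_repository_branch": env["DATASET_REPOSITORY_BRANCH"],
    "accelerator_type": env["ACCELERATOR_TYPE"],
    "test_run_time": env["TEST_RUN_TIME"],
    "train_run_time": env["TRAIN_RUN_TIME"],
}
print(data)
```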
1 change: 1 addition & 0 deletions .github/scripts/mr_publish_results.py
@@ -27,6 +27,7 @@ def send_to_segment(context):
"results",
{
"dataset": os.environ["DATASET_NAME"],
"dataset_repository_branch": os.environ["DATASET_REPOSITORY_BRANCH"],
"workflow": os.environ["GITHUB_WORKFLOW"],
"config": os.environ["CONFIG"],
"pr_url": os.environ["PR_URL"],
9 changes: 7 additions & 2 deletions .github/templates/model_regression_test_config_to_json.tmpl
@@ -4,6 +4,11 @@ The template reads an issue/a PR comment and transforms a YAML code block into JSON

*/ -}}
{{- $config := ((datasource "github").body | regexp.Find "```(?s)(.*)```" | regexp.ReplaceLiteral "```.*|\r" "" | yaml | toJSON | json) -}}
{{- $dataset_branch := "master" -}}
{{- /* if a branch name for the dataset repository is not defined, use the master branch */ -}}
{{- if has $config "dataset_branch" -}}
{{- $dataset_branch = $config.dataset_branch -}}
{{- end -}}
{"include":[
{{- $inc := coll.Slice -}}
{{- $dataset := coll.Slice -}}
@@ -19,10 +24,10 @@ The template reads an issue/a PR comment and transforms a YAML code block into JSON
{{- /* use all available configurations if value is equal to all */ -}}
{{- if eq $value_config "all" -}}
{{- range $config_name, $config_file := (datasource "mapping").configurations -}}
{{ $inc = (coll.Append (dict "dataset" $value_dataset "config" $config_name | toJSON) $inc) -}}
{{ $inc = (coll.Append (dict "dataset_branch" $dataset_branch "dataset" $value_dataset "config" $config_name | toJSON) $inc) -}}
{{- end -}}
{{- else -}}
{{ $inc = (coll.Append (dict "dataset" $value_dataset "config" $value_config | toJSON) $inc) -}}
{{ $inc = (coll.Append (dict "dataset_branch" $dataset_branch "dataset" $value_dataset "config" $value_config | toJSON) $inc) -}}
{{- end -}}
{{- end -}}
{{- end -}}
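The template's branch handling reduces to a default-and-expand rule. A Python sketch of the equivalent logic (the function name and sample values are illustrative, not part of the template):

```python
import itertools

def build_matrix_includes(comment_config: dict, known_configs: list) -> list:
    """Sketch of the gomplate logic: default `dataset_branch` to 'master',
    expand `config: ["all"]` to every known configuration, and attach the
    branch to each generated matrix entry."""
    branch = comment_config.get("dataset_branch", "master")
    includes = []
    for entry in comment_config.get("include", []):
        configs = entry["config"]
        if configs == ["all"]:
            configs = known_configs
        for dataset, config in itertools.product(entry["dataset"], configs):
            includes.append(
                {"dataset_branch": branch, "dataset": dataset, "config": config}
            )
    return includes

# Hypothetical comment body, matching the example in mr-test-example.yaml.
print(build_matrix_includes(
    {"dataset_branch": "test-branch",
     "include": [{"dataset": ["Carbon Bot", "Sara"], "config": ["all"]}]},
    known_configs=["Sparse + DIET(seq) + ResponseSelector(t2t)"],
))
```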
2 changes: 1 addition & 1 deletion .github/templates/model_regression_test_results.tmpl
@@ -37,7 +37,7 @@ Render Markdown with results.
{{- $results_master := (datasource "results_master") -}}
{{ range $dataset, $config := (datasource "data")}}
{{- $dataset_master := (index $results_master $dataset) -}}
Dataset: `{{$dataset}}`
Dataset: `{{$dataset}}`, Dataset repository branch: `{{ (index $config (index (keys $config) 0)).dataset_repository_branch }}`

| Configuration | Intent Classification Micro F1 | Entity Recognition Micro F1 | Response Selection Micro F1 |
|---------------|-----------------|-----------------|-------------------|
1 change: 1 addition & 0 deletions .github/workflows/ci-model-regression-on-schedule.yml
@@ -206,6 +206,7 @@ jobs:
TRAIN_RUN_TIME: ${{ steps.run_test.outputs.train_run_time }}
TOTAL_RUN_TIME: ${{ steps.run_test.outputs.total_run_time }}
PR_URL: ""
DATASET_REPOSITORY_BRANCH: "master"
run: |-
poetry run pip install analytics-python
poetry run python .github/scripts/mr_publish_results.py
4 changes: 4 additions & 0 deletions .github/workflows/ci-model-regression.yml
@@ -195,6 +195,7 @@ jobs:
repository: ${{ secrets.DATASET_REPOSITORY }}
token: ${{ secrets.ML_TEST_SA_PAT }}
path: 'dataset'
ref: ${{ matrix.dataset_branch }}

- name: Set DATASET and CONFIG variables
id: set_dataset_config_vars
@@ -298,6 +299,7 @@ jobs:
TEST_RUN_TIME: ${{ steps.run_test.outputs.test_run_time }}
TRAIN_RUN_TIME: ${{ steps.run_test.outputs.train_run_time }}
TOTAL_RUN_TIME: ${{ steps.run_test.outputs.total_run_time }}
DATASET_REPOSITORY_BRANCH: ${{ matrix.dataset_branch }}
run: |-
export PR_URL="https://github.com/${GITHUB_REPOSITORY}/pull/${{ github.event.number }}"
poetry run pip install analytics-python
@@ -335,6 +337,7 @@ jobs:
repository: ${{ secrets.DATASET_REPOSITORY }}
token: ${{ secrets.ML_TEST_SA_PAT }}
path: 'dataset'
ref: ${{ matrix.dataset_branch }}

- name: Download gomplate
run: |-
@@ -438,6 +441,7 @@ jobs:
TEST_RUN_TIME: ${{ steps.run_test.outputs.test_run_time }}
TRAIN_RUN_TIME: ${{ steps.run_test.outputs.train_run_time }}
TOTAL_RUN_TIME: ${{ steps.run_test.outputs.total_run_time }}
DATASET_REPOSITORY_BRANCH: ${{ matrix.dataset_branch }}
run: |-
export PR_URL="https://github.com/${GITHUB_REPOSITORY}/pull/${{ github.event.number }}"
poetry run pip install analytics-python
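Each job then receives one entry of the generated matrix; the fields referenced above as `${{ matrix.dataset_branch }}`, `${{ matrix.dataset }}`, and `${{ matrix.config }}` correspond to keys like these (values are illustrative):

```python
# One generated matrix entry, as consumed by the workflow above.
matrix_entry = {
    "dataset_branch": "test-branch",  # feeds actions/checkout's `ref` input
    "dataset": "Carbon Bot",
    "config": "Sparse + DIET(seq) + ResponseSelector(t2t)",
}
```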
35 changes: 35 additions & 0 deletions CHANGELOG.mdx
@@ -16,6 +16,41 @@ https://github.com/RasaHQ/rasa/tree/master/changelog/ . -->

<!-- TOWNCRIER -->

## [2.1.3] - 2020-12-04


### Improvements
- [#7426](https://github.com/rasahq/rasa/issues/7426): Removed `multidict` from the project dependencies. `multidict` remains a second-order
dependency of Rasa Open Source, but its version is now determined by the dependencies
that use it rather than by Rasa Open Source directly.

This resolves issues like the following:

```bash
sanic 20.9.1 has requirement multidict==5.0.0, but you'll have multidict 4.6.0 which is incompatible.
```

### Bugfixes
- [#7316](https://github.com/rasahq/rasa/issues/7316): `SingleStateFeaturizer` checks whether it was trained with `RegexInterpreter` as
the NLU interpreter. If so, `RegexInterpreter` is also used during prediction.
- [#7390](https://github.com/rasahq/rasa/issues/7390): Make sure the `responses` are synced between NLU training data and the Domain even if there are no retrieval intents in the NLU training data.
- [#7417](https://github.com/rasahq/rasa/issues/7417): Categorical slots now get a default value set when only updating NLG data in the domain.

Previously this resulted in an `InvalidDomain` exception being thrown.
- [#7418](https://github.com/rasahq/rasa/issues/7418): - Preserve `domain` slot ordering while dumping it back to the file.
- Preserve multiline `text` examples of `responses` defined in `domain` and `NLU` training data.


## [2.1.2] - 2020-11-27


### Bugfixes
- [#7235](https://github.com/rasahq/rasa/issues/7235): Slots that use `initial_value` no longer cause rule contradiction errors when `conversation_start: true` is used. Previously, two rules that differed only in their use of `conversation_start` were flagged as contradicting when a slot used `initial_value`.

When checking for incomplete rules, an action is now required to have set _only_ those slots that the same action sets in another rule. Previously, an action was also expected to have set slots which appeared after that action in another rule but which it did not actually set.
- [#7345](https://github.com/rasahq/rasa/issues/7345): Fixed Rasa Open Source not being able to fetch models from certain URLs.


## [2.1.1] - 2020-11-23


1 change: 1 addition & 0 deletions changelog/6804.bugfix.md
@@ -0,0 +1 @@
Rename `language_list` to `supported_language_list` for `JiebaTokenizer`.
1 change: 1 addition & 0 deletions changelog/7407.bugfix.md
@@ -0,0 +1 @@
Remove a token when its text (for example, whitespace) can't be tokenized by the language model tokenizer (in `LanguageModelFeaturizer`).
15 changes: 15 additions & 0 deletions changelog/7495.removal.md
@@ -0,0 +1,15 @@
Deprecate training and test data in Markdown format. This includes:
- reading and writing of story files in Markdown format
- reading and writing of NLU data in Markdown format
- reading and writing of retrieval intent data in Markdown format

Support for Markdown data will be removed entirely in Rasa Open Source 3.0.0.

Please convert your existing Markdown data by using the commands
from the [migration guide](./migration-guide.mdx#rasa-21-to-rasa-22):

```bash
rasa data convert nlu -f yaml --data={SOURCE_DIR} --out={TARGET_DIR}
rasa data convert nlg -f yaml --data={SOURCE_DIR} --out={TARGET_DIR}
rasa data convert core -f yaml --data={SOURCE_DIR} --out={TARGET_DIR}
```
10 changes: 10 additions & 0 deletions data/test_domains/domain_with_categorical_slot.yml
@@ -0,0 +1,10 @@
slots:
category_slot:
type: categorical
values:
- value_one
- value_two

responses:
utter_greet:
- text: "hey there!"
3 changes: 3 additions & 0 deletions data/test_nlg/test_responses.yml
@@ -0,0 +1,3 @@
responses:
utter_rasa:
- text: this is utter_rasa!
14 changes: 13 additions & 1 deletion docs/docs/migration-guide.mdx
@@ -12,6 +12,18 @@ how you can migrate from one version to another.

## Rasa 2.1 to Rasa 2.2

### Deprecations

Training and test data in Markdown format is now deprecated. This includes:
- reading and writing of story files in Markdown format
- reading and writing of NLU data in Markdown format
- reading and writing of retrieval intent data in Markdown format

Support for Markdown data will be removed entirely in Rasa Open Source 3.0.0.

Please convert your existing Markdown data by using the commands
described [here](./migration-guide.mdx#training-data-files).

### Policies

[Policies](./policies.mdx) now require a `**kwargs` argument in their constructor and `load` method.
@@ -61,7 +73,7 @@ in the [forum](https://forum.rasa.com/t/rasa-open-source-2-0-is-out-now-internal
### Training data files

As of version 2.0, the new default training data format is YAML. Markdown is still supported,
but this will be deprecated in a future release.
but support for it will be removed in Rasa Open Source 3.0.0.

You can convert existing NLU, Stories, and NLG (i.e. `responses.md`) training data
files in the Markdown format to the new YAML format using the following commands:
4 changes: 1 addition & 3 deletions rasa/core/agent.py
@@ -355,9 +355,7 @@ def __init__(
self.policy_ensemble = self._create_ensemble(policies)

if self.domain is not None:
self.domain.add_requested_slot()
self.domain.add_knowledge_base_slots()
self.domain.add_categorical_slot_default_value()
self.domain.setup_slots()

PolicyEnsemble.check_domain_ensemble_compatibility(
self.policy_ensemble, self.domain
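The three slot-setup calls are folded into a single `Domain.setup_slots()` helper. Its body is not shown in this diff; a plausible sketch, assuming it simply chains the previous calls:

```python
class Domain:
    # ... existing Domain attributes and methods ...

    def setup_slots(self) -> None:
        """Add all implicitly defined slots to the domain.

        Sketch only: the diff replaces the three separate calls with this
        single entry point, so presumably it wraps the same logic.
        """
        self.add_requested_slot()
        self.add_knowledge_base_slots()
        self.add_categorical_slot_default_value()
```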
@@ -41,12 +41,18 @@ async def convert_and_write(cls, source_path: Path, output_path: Path) -> None:

# check if the source file is a test stories file
if MarkdownStoryReader.is_test_stories_file(source_path):
reader = MarkdownStoryReader(is_used_for_training=False, use_e2e=True)
reader = MarkdownStoryReader(
is_used_for_training=False,
use_e2e=True,
ignore_deprecation_warning=True,
)
output_core_path = cls._generate_path_for_converted_test_data_file(
source_path, output_path
)
else:
reader = MarkdownStoryReader(is_used_for_training=False)
reader = MarkdownStoryReader(
is_used_for_training=False, ignore_deprecation_warning=True
)
output_core_path = cls.generate_path_for_converted_training_data_file(
source_path, output_path
)
4 changes: 2 additions & 2 deletions rasa/model.py
@@ -320,7 +320,7 @@ async def model_fingerprint(file_importer: "TrainingDataImporter") -> Fingerprint
domain = copy.copy(domain)
# don't include the response texts in the fingerprint.
# Their fingerprint is separate.
domain.templates = []
domain.templates = {}

return {
FINGERPRINT_CONFIG_KEY: _get_fingerprint_of_config(
@@ -580,7 +580,7 @@ async def update_model_with_new_domain(
"""
model_path = Path(unpacked_model_path) / DEFAULT_CORE_SUBDIRECTORY_NAME
domain = await importer.get_domain()

domain.setup_slots()
domain.persist(model_path / DEFAULT_DOMAIN_PATH)


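The fingerprint fix matters because `domain.templates` is a mapping from response names to their variations, not a list, so clearing it with `[]` changed the attribute's type. A minimal illustration (the response name is taken from the test data added in this commit):

```python
# Responses/templates are keyed by response name:
templates = {"utter_greet": [{"text": "hey there!"}]}

# Excluding them from the fingerprint should preserve the mapping type:
templates = {}    # correct "empty" value
# templates = []  # previous value: a list, which breaks dict-style consumers
```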
6 changes: 6 additions & 0 deletions rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py
@@ -347,6 +347,12 @@ def _tokenize_example(
# use lm specific tokenizer to further tokenize the text
split_token_ids, split_token_strings = self._lm_tokenize(token.text)

if not split_token_ids:
# handle the case where `token.text` contains only whitespace or other
# special characters, which leaves `split_token_ids` and `split_token_strings`
# empty and would make `self._lm_specific_token_cleanup()` raise an exception
continue

(split_token_ids, split_token_strings) = self._lm_specific_token_cleanup(
split_token_ids, split_token_strings
)
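The guard covers tokens whose text the language model tokenizer maps to nothing at all. A toy reproduction of the failure mode (the tokenizer here is a hypothetical stand-in, not the real `_lm_tokenize`):

```python
def lm_tokenize(text: str):
    # Stand-in: like many LM tokenizers, return nothing for
    # pure-whitespace or otherwise untokenizable input.
    pieces = text.split()
    return list(range(len(pieces))), pieces

for token_text in ["hello", " "]:
    split_token_ids, split_token_strings = lm_tokenize(token_text)
    if not split_token_ids:
        # Without this guard, the empty lists would reach the
        # model-specific cleanup step and raise there.
        continue
    print(token_text, split_token_ids, split_token_strings)
```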
3 changes: 2 additions & 1 deletion rasa/nlu/tokenizers/jieba_tokenizer.py
@@ -17,8 +17,9 @@


class JiebaTokenizer(Tokenizer):
"""This tokenizer is a wrapper for Jieba (https://github.com/fxsjy/jieba)."""

language_list = ["zh"]
supported_language_list = ["zh"]

defaults = {
"dictionary_path": None,
@@ -1,5 +1,5 @@
from pathlib import Path
from typing import Dict, Text
from typing import Text

from rasa.shared.constants import UTTER_PREFIX
from rasa.shared.nlu.training_data.formats import NLGMarkdownReader
@@ -31,7 +31,7 @@ async def convert_and_write(cls, source_path: Path, output_path: Path) -> None:
source_path: Path to the training data file.
output_path: Path to the output directory.
"""
reader = NLGMarkdownReader()
reader = NLGMarkdownReader(ignore_deprecation_warning=True)
writer = RasaYAMLWriter()

output_nlg_path = cls.generate_path_for_converted_training_data_file(
@@ -35,7 +35,9 @@ async def convert_and_write(cls, source_path: Path, output_path: Path) -> None:
source_path, output_path
)

yaml_training_data = MarkdownReader().read(source_path)
yaml_training_data = MarkdownReader(ignore_deprecation_warning=True).read(
source_path
)
RasaYAMLWriter().dump(output_nlu_path, yaml_training_data)

for lookup_table in yaml_training_data.lookup_tables:
3 changes: 3 additions & 0 deletions rasa/shared/constants.py
@@ -19,6 +19,9 @@
DOCS_URL_TRACKER_STORES = DOCS_BASE_URL + "/tracker-stores"
DOCS_URL_COMPONENTS = DOCS_BASE_URL + "/components"
DOCS_URL_MIGRATION_GUIDE = DOCS_BASE_URL + "/migration-guide"
DOCS_URL_MIGRATION_GUIDE_MD_DEPRECATION = (
f"{DOCS_URL_MIGRATION_GUIDE}#rasa-21-to-rasa-22"
)
DOCS_URL_TELEMETRY = DOCS_BASE_URL + "/telemetry/telemetry"
DOCS_BASE_URL_RASA_X = "https://rasa.com/docs/rasa-x"
