Skip to content

Commit

Permalink
Initial implementation of converter for training data files
Browse files Browse the repository at this point in the history
  • Loading branch information
Alexander Khizov committed Aug 14, 2020
1 parent b440c20 commit 622e455
Show file tree
Hide file tree
Showing 8 changed files with 205 additions and 14 deletions.
1 change: 1 addition & 0 deletions changelog/6404.feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Users can use the ``rasa data convert {nlu|core} -f yaml`` command to convert training data from Markdown format to YAML format.
22 changes: 22 additions & 0 deletions rasa/cli/arguments/convert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import argparse

from rasa.cli.arguments.default_arguments import add_model_param, add_domain_param


def set_convert_arguments(parser: argparse.ArgumentParser):
    """Register the converter's command line options on ``parser``.

    Two mandatory options are added, in this order:

    * ``--training_data``: location of the Markdown source data.
    * ``--output``: directory that receives the converted files.
    """
    training_data_help = (
        "Paths to the source NLU and Core data files in a Markdown format"
    )
    output_help = (
        "Path to the output directory where all the converted training data files "
        "will be written to. Converted files will have the same name as the "
        "original ones with a '_converted' suffix."
    )

    parser.add_argument("--training_data", required=True, help=training_data_help)
    parser.add_argument("--output", type=str, required=True, help=output_help)
11 changes: 7 additions & 4 deletions rasa/cli/arguments/data.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import argparse
from typing import Text

from rasa.cli.arguments.default_arguments import (
add_nlu_data_param,
Expand All @@ -8,8 +9,8 @@
)


def set_convert_arguments(parser: argparse.ArgumentParser):
add_data_param(parser, required=True, default=None, data_type="Rasa NLU ")
def set_convert_arguments(parser: argparse.ArgumentParser, data_type: Text):
add_data_param(parser, required=True, default=None, data_type=data_type)

add_out_param(
parser,
Expand All @@ -24,8 +25,10 @@ def set_convert_arguments(parser: argparse.ArgumentParser):
"-f",
"--format",
required=True,
choices=["json", "md"],
help="Output format the training data should be converted into.",
choices=["json", "md", "yaml"],
help="Output format the training data should be converted into. "
"Note: currently training data can be converted to 'yaml' format "
"only from 'markdown' format",
)


Expand Down
114 changes: 111 additions & 3 deletions rasa/cli/data.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,31 @@
import logging
import argparse
import asyncio
import os
from pathlib import Path
from typing import List

from rasa import data
from rasa.cli.arguments import data as arguments
import rasa.cli.utils
from rasa.constants import DEFAULT_DATA_PATH
from rasa.core.interpreter import RegexInterpreter
from rasa.core.training.story_reader.markdown_story_reader import MarkdownStoryReader
from rasa.core.training.story_writer.yaml_story_writer import YAMLStoryWriter
from rasa.nlu.convert import convert_training_data
from rasa.nlu.training_data.formats import MarkdownReader
from rasa.nlu.training_data.formats.rasa_yaml import RasaYAMLWriter
from rasa.validator import Validator
from rasa.importers.rasa import RasaFileImporter
from rasa.cli.utils import (
print_success,
print_error_and_exit,
print_info,
print_warning,
)

logger = logging.getLogger(__name__)
CONVERTED_FILE_SUFFIX = "_converted.yml"


# noinspection PyProtectedMember
Expand Down Expand Up @@ -51,11 +66,21 @@ def _add_data_convert_parsers(
"nlu",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
parents=parents,
help="Converts NLU data between Markdown and json formats.",
help="Converts NLU data between formats.",
)
convert_nlu_parser.set_defaults(func=convert.main)
convert_nlu_parser.set_defaults(func=_convert_nlu_data)

arguments.set_convert_arguments(convert_nlu_parser)
arguments.set_convert_arguments(convert_nlu_parser, data_type="Rasa NLU")

convert_core_parser = convert_subparsers.add_parser(
"core",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
parents=parents,
help="Converts Core data between formats.",
)
convert_core_parser.set_defaults(func=_convert_core_data)

arguments.set_convert_arguments(convert_core_parser, data_type="Rasa Core")


def _add_data_split_parsers(
Expand Down Expand Up @@ -182,3 +207,86 @@ def _validate_story_structure(validator: Validator, args: argparse.Namespace) ->
return validator.verify_story_structure(
not args.fail_on_warnings, max_history=args.max_history
)


def _convert_nlu_data(args: argparse.Namespace):
    """Run the NLU conversion that matches ``args.format``.

    ``json`` and ``md`` are delegated to ``convert_training_data``; ``yaml``
    is delegated to ``convert_to_yaml`` in NLU mode. Any other value is
    reported through ``print_error_and_exit``.
    """
    target_format = args.format

    if target_format in ["json", "md"]:
        convert_training_data(args.data, args.out, target_format, args.language)
        return

    if target_format == "yaml":
        convert_to_yaml(args, True)
        return

    print_error_and_exit(
        "Did not recognize output format. Supported output formats: 'json', "
        "'md', 'yaml'. Specify the desired output format with '--format'."
    )


def _convert_core_data(args: argparse.Namespace):
    """Run the Core (stories) conversion that matches ``args.format``.

    Only ``yaml`` is supported; it is delegated to ``convert_to_yaml`` in
    Core mode. Any other value is reported through ``print_error_and_exit``.
    """
    if args.format != "yaml":
        print_error_and_exit(
            "Did not recognize output format. Supported output formats: "
            "'yaml'. Specify the desired output format with '--format'."
        )
        return

    convert_to_yaml(args, False)


def convert_to_yaml(args: argparse.Namespace, is_nlu: bool) -> None:
    """Convert Markdown training data to YAML files inside an output directory.

    ``args.data`` may name either a single Markdown file or a directory. For a
    directory, only its direct entries are inspected (no recursion) and
    entries that are not regular files are skipped with a warning. Each
    converted file is written to ``args.out`` as
    ``<source stem><CONVERTED_FILE_SUFFIX>``.

    Args:
        args: Parsed CLI arguments. ``args.data`` is the source path and
            ``args.out`` is the output directory, which must already exist.
        is_nlu: ``True`` converts Markdown NLU files, ``False`` converts
            Markdown story (Core) files.
    """
    output = Path(args.out)
    # Require a directory, not merely an existing path: converted files are
    # created as children of ``output``.
    if not output.is_dir():
        print_error_and_exit(
            f"The output path '{output}' doesn't exist. Please make sure to specify "
            f"an existing directory and try again."
        )

    training_data = Path(args.data)
    if not training_data.exists():
        # The command aborts here, so the message must not claim the path is
        # merely skipped.
        print_error_and_exit(
            f"The training data path '{training_data}' doesn't exist. Please make "
            f"sure to specify an existing file or directory and try again."
        )

    if training_data.is_file():
        # Listing a file would raise, so treat it as the only candidate.
        candidates = [training_data]
    else:
        # Sorted so that the conversion order does not depend on the platform.
        candidates = sorted(training_data.iterdir())

    num_of_files_converted = 0
    for source_path in candidates:
        if not source_path.is_file():
            # The format checks below read the entry as a file, which is not
            # possible for a sub-directory.
            print_warning(f"Skipped '{source_path}' because it is not a file.")
            continue

        output_path = output / f"{source_path.stem}{CONVERTED_FILE_SUFFIX}"

        if MarkdownReader.is_markdown_nlu_file(source_path):
            if not is_nlu:
                # NLU files are silently ignored when converting Core data.
                continue
            _write_nlu_yaml(source_path, output_path, source_path)
            num_of_files_converted += 1
        elif not is_nlu and MarkdownStoryReader.is_markdown_story_file(source_path):
            _write_core_yaml(source_path, output_path, source_path)
            num_of_files_converted += 1
        else:
            print_warning(f"Skipped file '{source_path}'")

    print_info(f"Converted {num_of_files_converted} file(s), saved in '{output}'.")


def _write_nlu_yaml(
    training_data_path: Path, output_path: Path, source_path: Path
) -> None:
    """Read one Markdown NLU file and dump its training data as YAML.

    Args:
        training_data_path: File handed to ``MarkdownReader.read``.
        output_path: Destination handed to ``RasaYAMLWriter.dump``.
        source_path: Path shown as the source in the success message.
    """
    markdown_reader, yaml_writer = MarkdownReader(), RasaYAMLWriter()

    yaml_writer.dump(output_path, markdown_reader.read(training_data_path))

    print_success(f"Converted NLU file: '{source_path}' >> '{output_path}'.")


def _write_core_yaml(
    training_data_path: Path, output_path: Path, source_path: Path
) -> None:
    """Read one Markdown story file and dump its story steps as YAML.

    The reader's ``read_from_file`` is a coroutine, so it is driven to
    completion on the current asyncio event loop before the result is written.

    Args:
        training_data_path: File handed to ``MarkdownStoryReader.read_from_file``.
        output_path: Destination handed to ``YAMLStoryWriter.dump``.
        source_path: Path shown as the source in the success message.
    """
    story_reader = MarkdownStoryReader(RegexInterpreter())
    story_writer = YAMLStoryWriter()

    story_steps = asyncio.get_event_loop().run_until_complete(
        story_reader.read_from_file(training_data_path)
    )
    story_writer.dump(output_path, story_steps)

    print_success(f"Converted Core file: '{source_path}' >> '{output_path}'.")
6 changes: 3 additions & 3 deletions rasa/core/training/story_reader/markdown_story_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
import logging
import os
import re
from pathlib import PurePath
from typing import Dict, Text, List, Any
from pathlib import PurePath, Path
from typing import Dict, Text, List, Any, Union

import rasa.utils.io as io_utils
from rasa.constants import DOCS_URL_DOMAINS, DOCS_URL_STORIES
Expand All @@ -27,7 +27,7 @@ class MarkdownStoryReader(StoryReader):
"""

async def read_from_file(self, filename: Text) -> List[StoryStep]:
async def read_from_file(self, filename: Union[Text, Path]) -> List[StoryStep]:
"""Given a md file reads the contained stories."""

try:
Expand Down
8 changes: 7 additions & 1 deletion rasa/nlu/training_data/formats/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
import typing
from collections import OrderedDict
from json import JSONDecodeError
from typing import Any, Text, Optional, Tuple, Dict, Match
from typing import Any, Text, Optional, Tuple, Dict

import rasa.utils.io as io_utils
from rasa.constants import DOCS_URL_TRAINING_DATA_NLU
from rasa.nlu.training_data.formats.readerwriter import (
TrainingDataReader,
Expand Down Expand Up @@ -201,6 +202,11 @@ def _set_current_section(self, section: Text, title: Text) -> None:
self.current_section = section
self.current_title = title

@staticmethod
def is_markdown_nlu_file(filename: Text) -> bool:
    """Tell whether a file looks like Markdown NLU training data.

    The file is read in full and reported as NLU data as soon as any entry
    of ``MARKDOWN_SECTION_MARKERS`` occurs in its content.
    """
    file_text = io_utils.read_file(filename)
    for section_marker in MARKDOWN_SECTION_MARKERS:
        if section_marker in file_text:
            return True
    return False


class MarkdownWriter(TrainingDataWriter):
def dumps(self, training_data: "TrainingData") -> Text:
Expand Down
5 changes: 3 additions & 2 deletions rasa/nlu/training_data/formats/readerwriter.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
from collections import OrderedDict
from pathlib import Path

from rasa.core.constants import INTENT_MESSAGE_PREFIX

Expand All @@ -16,7 +17,7 @@
import rasa.utils.io as io_utils
import typing
from rasa.nlu import utils
from typing import Text, Dict, Any
from typing import Text, Dict, Any, Union

if typing.TYPE_CHECKING:
from rasa.nlu.training_data import TrainingData
Expand All @@ -26,7 +27,7 @@ class TrainingDataReader:
def __init__(self):
self.filename: Text = ""

def read(self, filename: Text, **kwargs: Any) -> "TrainingData":
def read(self, filename: Union[Text, Path], **kwargs: Any) -> "TrainingData":
"""Reads TrainingData from a file."""
self.filename = filename
return self.reads(io_utils.read_file(filename), **kwargs)
Expand Down
52 changes: 51 additions & 1 deletion tests/cli/test_rasa_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def test_data_convert_help(run: Callable[..., RunResult]):
output = run("data", "convert", "nlu", "--help")

help_text = """usage: rasa data convert nlu [-h] [-v] [-vv] [--quiet] --data DATA --out OUT
[-l LANGUAGE] -f {json,md}"""
[-l LANGUAGE] -f {json,md,yaml}"""

lines = help_text.split("\n")

Expand Down Expand Up @@ -112,3 +112,53 @@ def test_validate_files_exit_early():

assert pytest_e.type == SystemExit
assert pytest_e.value.code == 1


def test_rasa_data_convert_to_yaml(
run_in_simple_project: Callable[..., RunResult], run: Callable[..., RunResult]
):
"""Check that `rasa data convert {nlu|core} -f yaml` writes converted files.

Writes one Markdown NLU file and one Markdown story file into `data/`,
runs both convert commands against that folder and expects exactly two
entries in the output folder afterwards.
"""
# NOTE(review): the `run` fixture is requested but never used in this body.
# Output folder for the converted files; the relative path is resolved
# against the current working directory.
# Presumably `run_in_simple_project` has already switched into the project
# directory at this point - confirm against the fixture.
converted_data_folder = "converted_data"
os.mkdir(converted_data_folder)

# Minimal Markdown NLU fixture: a single `greet` intent with two examples.
simple_nlu_md = """
## intent:greet
- hey
- hello
"""

with open("data/nlu.md", "w") as f:
f.write(simple_nlu_md)

# Minimal Markdown story fixture: one story with a `greet` / `utter_greet` turn.
simple_story_md = """
## happy path
* greet
- utter_greet
"""

with open("data/stories.md", "w") as f:
f.write(simple_story_md)

# Convert the NLU data found in `data` to YAML.
run_in_simple_project(
"data",
"convert",
"nlu",
"-f",
"yaml",
"--data",
"data",
"--out",
converted_data_folder,
)
# Convert the Core (story) data found in `data` to YAML.
run_in_simple_project(
"data",
"convert",
"core",
"-f",
"yaml",
"--data",
"data",
"--out",
converted_data_folder,
)

# Two entries are expected in total - presumably one per source file written above.
assert len(os.listdir(converted_data_folder)) == 2

0 comments on commit 622e455

Please sign in to comment.