-
Notifications
You must be signed in to change notification settings - Fork 909
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[DRAFT] Dataset factory parsing rules demo #2559
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,11 @@ | ||
"""A collection of CLI commands for working with Kedro catalog.""" | ||
import copy | ||
from collections import defaultdict | ||
|
||
import click | ||
import yaml | ||
from click import secho | ||
from parse import parse | ||
|
||
from kedro.framework.cli.utils import KedroCliError, env_option, split_string | ||
from kedro.framework.project import pipelines, settings | ||
|
@@ -174,3 +176,64 @@ def _add_missing_datasets_to_catalog(missing_ds, catalog_path): | |
catalog_path.parent.mkdir(exist_ok=True) | ||
with catalog_path.open(mode="w") as catalog_file: | ||
yaml.safe_dump(catalog_config, catalog_file, default_flow_style=False) | ||
|
||
|
||
def pick_best_match(matches): | ||
matches = sorted(matches, key=lambda x: (specificity(x[0]), -x[0].count("{"), x[0])) | ||
return matches[0] | ||
|
||
|
||
def specificity(pattern): | ||
"""This function will check length of exactly matched characters not inside brackets | ||
Example - | ||
specificity("{namespace}.companies") = 10 | ||
specificity("{namespace}.{dataset}") = 1 | ||
specificity("france.companies") = 16 | ||
""" | ||
pattern_variables = parse(pattern, pattern).named | ||
for k in pattern_variables: | ||
pattern_variables[k] = "" | ||
specific_characters = pattern.format(**pattern_variables) | ||
return -len(specific_characters) | ||
|
||
|
||
@catalog.command("resolve") | ||
@env_option | ||
@click.pass_obj | ||
def resolve_catalog_datasets(metadata: ProjectMetadata, env): | ||
session = _create_session(metadata.package_name, env=env) | ||
context = session.load_context() | ||
catalog_conf = context.config_loader["catalog"] | ||
|
||
# Create a list of all datasets used in the project pipelines. | ||
pipeline_datasets = [] | ||
for _, pl_obj in pipelines.items(): | ||
pipeline_ds = pl_obj.data_sets() | ||
for dataset in pipeline_ds: | ||
pipeline_datasets.append(dataset) | ||
pipeline_datasets = set(pipeline_datasets) | ||
result_catalog = {} | ||
for pipeline_dataset in pipeline_datasets: | ||
matches = [] | ||
for ds_name in catalog_conf.keys(): | ||
result = parse(ds_name, pipeline_dataset) | ||
if not result: | ||
continue | ||
# We have found a match! | ||
matches.append((ds_name, result)) | ||
if len(matches) == 0: | ||
# print(f"skipping {pipeline_dataset} -> maybe params or MemoryDataSet") | ||
continue | ||
best_match, result = pick_best_match(matches) | ||
best_match_config = copy.deepcopy(catalog_conf[best_match]) | ||
# Match results to patterns in best matching catalog entry | ||
for key, value in best_match_config.items(): | ||
string_value = str(value) | ||
try: | ||
formatted_string = string_value.format_map(result.named) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should only happen for dataset fields with
But that's not really part of the matching logic, so not super important here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This can be changed to only match with patterns containing So the exact match then gets chosen as the best match and since it wouldn't contain any brackets anywhere in the catalog entry, it would not be changed at all. |
||
except KeyError: | ||
# Dataset config has a placeholder which is not present in the ds name | ||
print(f"'{key}' has invalid catalog configuration") | ||
best_match_config[key] = formatted_string | ||
result_catalog[pipeline_dataset] = best_match_config | ||
secho(yaml.dump(result_catalog)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this method just counting the number of characters that are not in brackets? Wouldn't it be easier to just remove the text in {} and then get the length of the string?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That is what is happening here but using
parse
instead of a regex.