Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(ingest): support writing configs to files #8696

Merged
merged 3 commits into from
Aug 28, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 38 additions & 6 deletions metadata-ingestion/src/datahub/configuration/config_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import pathlib
import re
import sys
import tempfile
import unittest.mock
from typing import Any, Dict, Set, Union
from urllib import parse
Expand All @@ -14,7 +15,7 @@
from datahub.configuration.yaml import YamlConfigurationMechanism


def resolve_element(element: str) -> str:
def _resolve_element(element: str) -> str:
if re.search(r"(\$\{).+(\})", element):
return expandvars(element, nounset=True)
elif element.startswith("$"):
Expand All @@ -30,7 +31,7 @@ def _resolve_list(ele_list: list) -> list:
new_v: list = []
for ele in ele_list:
if isinstance(ele, str):
new_v.append(resolve_element(ele))
new_v.append(_resolve_element(ele))
elif isinstance(ele, list):
new_v.append(_resolve_list(ele))
elif isinstance(ele, dict):
Expand All @@ -48,7 +49,7 @@ def resolve_env_variables(config: dict) -> dict:
elif isinstance(v, list):
new_dict[k] = _resolve_list(v)
elif isinstance(v, str):
new_dict[k] = resolve_element(v)
new_dict[k] = _resolve_element(v)
else:
new_dict[k] = v
return new_dict
Expand All @@ -67,12 +68,40 @@ def list_referenced_env_variables(config: dict) -> Set[str]:
return set([call[1][0] for call in calls])


# Marker for "write to file" directives in a loaded config: when a dict key
# starts with this prefix, the entry's value is written to a temporary file and
# the entry is replaced by one keyed on the rest of the name (prefix removed)
# whose value is the path of that file.
WRITE_TO_FILE_DIRECTIVE_PREFIX = "__DATAHUB_TO_FILE_"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this notion of "directive" a YAML thing, or something we've introduced before?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it's part of "datahub-flavored yaml", which is something I just made up :)

this change is mainly to make it easy to run UI ingestion - don't expect this to be a common way we do things broadly



def _process_directives(config: dict) -> dict:
def _process(obj: Any) -> Any:
if isinstance(obj, dict):
new_obj = {}
for k, v in obj.items():
if isinstance(k, str) and k.startswith(WRITE_TO_FILE_DIRECTIVE_PREFIX):
# This writes the value to a temporary file and replaces the value with the path to the file.
config_option = k[len(WRITE_TO_FILE_DIRECTIVE_PREFIX) :]

with tempfile.NamedTemporaryFile("w", delete=False) as f:
filepath = f.name
f.write(v)

new_obj[config_option] = filepath
else:
new_obj[k] = _process(v)

return new_obj
else:
return obj

return _process(config)


def load_config_file(
config_file: Union[str, pathlib.Path],
squirrel_original_config: bool = False,
squirrel_field: str = "__orig_config",
allow_stdin: bool = False,
resolve_env_vars: bool = True,
process_directives: bool = True,
) -> dict:
config_mech: ConfigurationMechanism
if allow_stdin and config_file == "-":
Expand Down Expand Up @@ -105,10 +134,13 @@ def load_config_file(

config_fp = io.StringIO(raw_config_file)
raw_config = config_mech.load_config(config_fp)

config = raw_config.copy()
if resolve_env_vars:
config = resolve_env_variables(raw_config)
else:
config = raw_config
config = resolve_env_variables(config)
if process_directives:
config = _process_directives(config)

if squirrel_original_config:
config[squirrel_field] = raw_config
return config
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,6 @@ def _validate_entity_type(entity_type: str) -> None:
raise InvalidUrnError(
f"Entity type should be {DataPlatformUrn.ENTITY_TYPE} but found {entity_type}"
)

def get_platform_name(self) -> str:
    """Return the first component of this URN's entity id."""
    entity_id = self.get_entity_id()
    return entity_id[0]
4 changes: 4 additions & 0 deletions metadata-ingestion/tests/unit/config/basic.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,7 @@ nested:
array:
- one
- two
numbers:
4: "four"
6: "six"
"8": "eight"
55 changes: 54 additions & 1 deletion metadata-ingestion/tests/unit/config/test_config_loader.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import os
import pathlib
import textwrap
from unittest import mock

import deepdiff
import expandvars
import pytest
import yaml
Expand All @@ -18,7 +21,14 @@
(
# Basic YAML load
"tests/unit/config/basic.yml",
{"foo": "bar", "nested": {"array": ["one", "two"], "hi": "hello"}},
{
"foo": "bar",
"nested": {
"array": ["one", "two"],
"hi": "hello",
"numbers": {4: "four", 6: "six", "8": "eight"},
},
},
{},
set(),
),
Expand Down Expand Up @@ -165,3 +175,46 @@ def test_load_error(pytestconfig, filename, env, error_type):
with mock.patch.dict(os.environ, env):
with pytest.raises(error_type):
_ = load_config_file(filepath)


def test_write_file_directive(pytestconfig):
    """Check that write-to-file directives are replaced by paths to files.

    Loads ``write_to_file_directive.yml`` with ``DATAHUB_SSL_KEY`` set in the
    environment, then verifies that the ordinary keys come through unchanged
    and that ``ssl_cert`` / ``ssl_key`` point at files holding the expected
    contents.
    """
    config_path = (
        pytestconfig.rootpath / "tests/unit/config/write_to_file_directive.yml"
    )
    fake_ssl_key = "my-secret-key-value"

    # Everything except the two directive-generated entries, whose values are
    # temp-file paths that cannot be predicted.
    expected_static_config = {
        "foo": "bar",
        "nested": {
            "hi": "hello",
            "another-key": "final-value",
        },
    }
    generated_paths = [
        "root['nested']['ssl_cert']",
        "root['nested']['ssl_key']",
    ]
    expected_cert = textwrap.dedent(
        """
        -----BEGIN CERTIFICATE-----
        thisisnotarealcert
        -----END CERTIFICATE-----
        """
    ).lstrip()

    with mock.patch.dict(os.environ, {"DATAHUB_SSL_KEY": fake_ssl_key}):
        loaded_config = load_config_file(config_path, squirrel_original_config=False)

        # The non-directive part of the config must be untouched.
        diff = deepdiff.DeepDiff(
            loaded_config,
            expected_static_config,
            exclude_paths=generated_paths,
        )
        assert not diff

        nested = loaded_config["nested"]

        # The literal certificate block was written to a file.
        cert_file = pathlib.Path(nested["ssl_cert"])
        assert cert_file.read_text() == expected_cert

        # The key, taken from the environment variable, was written to a file.
        key_file = pathlib.Path(nested["ssl_key"])
        assert key_file.read_text() == fake_ssl_key
11 changes: 11 additions & 0 deletions metadata-ingestion/tests/unit/config/write_to_file_directive.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
foo: bar
nested:
hi: hello
__DATAHUB_TO_FILE_ssl_cert: |
hsheth2 marked this conversation as resolved.
Show resolved Hide resolved
-----BEGIN CERTIFICATE-----
thisisnotarealcert
-----END CERTIFICATE-----

__DATAHUB_TO_FILE_ssl_key: ${DATAHUB_SSL_KEY}

another-key: final-value
Loading