WRITE_TO_FILE_DIRECTIVE_PREFIX = "__DATAHUB_TO_FILE_"


def _process_directives(config: dict) -> dict:
    """Apply loader directives found in *config* and return a new config.

    The only directive handled here is the write-to-file directive: a dict
    key of the form ``__DATAHUB_TO_FILE_<option>`` has its string value
    written to a temporary file, and the key is replaced by ``<option>``
    whose value is the path of that file.

    Dicts and lists are walked recursively, so a directive is honored at any
    nesting depth. New containers are built along the way; the input is not
    mutated. Non-container values are returned as-is.

    The temporary files are created with ``delete=False`` and are not removed
    by this function.

    Args:
        config: The parsed configuration.

    Returns:
        A copy of ``config`` with every write-to-file directive replaced.

    Raises:
        TypeError: If the value attached to a directive key is not a string.
    """

    def _write_to_temp_file(directive_key: str, content: Any) -> str:
        # Validate before creating the file so that a bad value does not
        # leave an empty temp file behind. The value itself is kept out of
        # the message because it is typically a secret.
        if not isinstance(content, str):
            raise TypeError(
                f"Value of {directive_key!r} must be a string, "
                f"got {type(content).__name__}"
            )
        with tempfile.NamedTemporaryFile("w", delete=False) as f:
            f.write(content)
        return f.name

    def _process(obj: Any) -> Any:
        if isinstance(obj, dict):
            new_obj = {}
            for k, v in obj.items():
                # Keys may be non-strings (e.g. YAML integer keys).
                is_directive = isinstance(k, str) and k.startswith(
                    WRITE_TO_FILE_DIRECTIVE_PREFIX
                )
                if is_directive:
                    config_option = k[len(WRITE_TO_FILE_DIRECTIVE_PREFIX) :]
                    new_obj[config_option] = _write_to_temp_file(k, v)
                else:
                    new_obj[k] = _process(v)
            return new_obj
        if isinstance(obj, list):
            # Directives may sit inside dicts that are list items.
            return [_process(item) for item in obj]
        return obj

    return _process(config)
"eight" diff --git a/metadata-ingestion/tests/unit/config/test_config_loader.py b/metadata-ingestion/tests/unit/config/test_config_loader.py index e29aa3b0b582c3..3253c96b876aa9 100644 --- a/metadata-ingestion/tests/unit/config/test_config_loader.py +++ b/metadata-ingestion/tests/unit/config/test_config_loader.py @@ -1,6 +1,9 @@ import os +import pathlib +import textwrap from unittest import mock +import deepdiff import expandvars import pytest import yaml @@ -18,7 +21,14 @@ ( # Basic YAML load "tests/unit/config/basic.yml", - {"foo": "bar", "nested": {"array": ["one", "two"], "hi": "hello"}}, + { + "foo": "bar", + "nested": { + "array": ["one", "two"], + "hi": "hello", + "numbers": {4: "four", 6: "six", "8": "eight"}, + }, + }, {}, set(), ), @@ -165,3 +175,46 @@ def test_load_error(pytestconfig, filename, env, error_type): with mock.patch.dict(os.environ, env): with pytest.raises(error_type): _ = load_config_file(filepath) + + +def test_write_file_directive(pytestconfig): + filepath = pytestconfig.rootpath / "tests/unit/config/write_to_file_directive.yml" + + fake_ssl_key = "my-secret-key-value" + + with mock.patch.dict(os.environ, {"DATAHUB_SSL_KEY": fake_ssl_key}): + loaded_config = load_config_file(filepath, squirrel_original_config=False) + + # Check that the rest of the dict is unmodified. + diff = deepdiff.DeepDiff( + loaded_config, + { + "foo": "bar", + "nested": { + "hi": "hello", + "another-key": "final-value", + }, + }, + exclude_paths=[ + "root['nested']['ssl_cert']", + "root['nested']['ssl_key']", + ], + ) + assert not diff + + # Check that the ssl_cert was written to a file. + ssl_cert_path = loaded_config["nested"]["ssl_cert"] + assert ( + pathlib.Path(ssl_cert_path).read_text() + == textwrap.dedent( + """ + -----BEGIN CERTIFICATE----- + thisisnotarealcert + -----END CERTIFICATE----- + """ + ).lstrip() + ) + + # Check that the ssl_key was written to a file. 
+ ssl_key_path = loaded_config["nested"]["ssl_key"] + assert pathlib.Path(ssl_key_path).read_text() == fake_ssl_key diff --git a/metadata-ingestion/tests/unit/config/write_to_file_directive.yml b/metadata-ingestion/tests/unit/config/write_to_file_directive.yml new file mode 100644 index 00000000000000..e47f192096309d --- /dev/null +++ b/metadata-ingestion/tests/unit/config/write_to_file_directive.yml @@ -0,0 +1,11 @@ +foo: bar +nested: + hi: hello + __DATAHUB_TO_FILE_ssl_cert: | + -----BEGIN CERTIFICATE----- + thisisnotarealcert + -----END CERTIFICATE----- + + __DATAHUB_TO_FILE_ssl_key: ${DATAHUB_SSL_KEY} + + another-key: final-value