From 618a7778f1e15a7ee1c2d2b74e4c017105f01f9d Mon Sep 17 00:00:00 2001
From: Ada Draginda
Date: Tue, 21 Feb 2023 13:09:08 -0800
Subject: [PATCH] do not return schema data by default (#8)

---
 CHANGE_LOG.md               |  21 ++++++
 pyproject.toml              |   2 +-
 src/datahub_tools/client.py | 139 +++++++++++++++++-------------------
 src/datahub_tools/utils.py  |  33 ---------
 4 files changed, 88 insertions(+), 107 deletions(-)
 delete mode 100644 src/datahub_tools/utils.py

diff --git a/CHANGE_LOG.md b/CHANGE_LOG.md
index 5a934d4..91db5c2 100644
--- a/CHANGE_LOG.md
+++ b/CHANGE_LOG.md
@@ -1,5 +1,26 @@
 # Change Log
 
+### v1.0.0 - 2023-02-21 Ada Draginda
+#### Deprecations
+* `extract_dbt_resources` has moved from a soft to a hard deprecation. Instead, use
+`datahub_tools.dbt.extract_dbt_resources`
+* `client.update_description` has moved from a soft to a hard deprecation. Instead, use
+`update_field_descriptions` or `update_dataset_description`
+
+#### Changes
+* DataHub posts are now logged with fewer linebreaks and repeated spaces
+* `client.get_datahub_entities` no longer returns schema data by default. You can turn this
+feature back on with the `with_schema` argument. This change was made for performance reasons.
+
+### v0.4.0 - 2023-02-09 Ada Draginda
+#### Changes
+* Added a new `get_owners` function to the client module
+
+### v0.3.0 - 2023-02-07 Ada Draginda
+#### Changes
+* Added an example of how to use transformers
+* New DBT module for fetching DBT dependency lineage
+
 ### v0.2.0 - 2023-01-31 Ada Draginda
 #### Deprecations
 * `client.update_description` has been deprecated in favor of `client.update_field_descriptions`
diff --git a/pyproject.toml b/pyproject.toml
index 3dfba3f..6f8ed0f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = 'setuptools.build_meta'
 [project]
 name = 'datahub_tools'
 description = 'Python tools for working with DataHub'
-version = '0.4.1'
+version = '1.0.0'
 readme = 'README.md'
 requires-python = '>=3.7'
 dependencies = [
diff --git a/src/datahub_tools/client.py b/src/datahub_tools/client.py
index f5cb7b8..8353ab9 100644
--- a/src/datahub_tools/client.py
+++ b/src/datahub_tools/client.py
@@ -4,7 +4,6 @@
 import logging
 import os
 import re
-import warnings
 from string import Template
 from textwrap import dedent
 from typing import Dict, Iterable, List
@@ -53,7 +52,9 @@ def datahub_post(body: Dict) -> Dict:
     graphql_url = get_dh_graphql_url()
     logger = logging.getLogger(__name__)
     # the sub just condenses down the body e.g.: 'query: \n {...' -> 'query: {...'
-    logger.info("posting to %s: %s", graphql_url, re.sub(r"\n+\s*", "", str(body)))
+    # Note the extra backslash (\\n+): body is a dict, and str() renders its values
+    # with repr, so a newline inside a value appears as the two characters "\n".
+    logger.info("posting to %s: %s", graphql_url, re.sub(r"\\n+\s*", "", str(body)))
 
     response = requests.post(url=graphql_url, headers=headers, json=body)
     response.raise_for_status()
@@ -119,10 +120,16 @@
     emitter.emit(metadata_event)
 
 
-def get_datahub_entities(start: int = 0, limit: int = 10000) -> List[DHEntity]:
+def get_datahub_entities(
+    start: int = 0, limit: int = 10000, with_schema: bool = False
+) -> List[DHEntity]:
     """
     :param start: Index of the first record to return
-    :param limit: Maximum number of records to return (default and maximum per query is 10000).
+    :param limit: Maximum number of records to return (default and maximum per query
+        is 10000).
+    :param with_schema: If True (default is False), schema fields and descriptions
+        are also retrieved (warning: this may be slow or cause the DataHub endpoint
+        to return a 503, in which case retrieve your entities in chunks).
     :return: Dictionary of snowflake name (e.g. prep.core.calendar) to DataHub urn
         e.g. urn:li:dataset:(urn:li:dataPlatform:snowflake,prep.core.calendar,PROD)
     """
@@ -158,28 +165,9 @@ def get_datahub_entities(start: int = 0, limit: int = 10000) -> List[DHEntity]:
     """
     )
 
-    query_body = Template(
-        dedent(
-            """{
-    search(input: {type: DATASET, query: "*", start:$start, count: $limit})
-    {
-      start
-      count
-      searchResults {
-        entity {
-          urn
-          type
-          ...on Dataset {
-            name
-            properties {
-              qualifiedName
-              description
-              customProperties {
-                ...on CustomPropertiesEntry { key value }
-              }
-            }
-            editableProperties { description }
-            schemaMetadata {
+    schema_query = Template(dedent(
+        """
+        schemaMetadata {
             fields {
               ...on SchemaField {
                 $field_vars
@@ -194,22 +182,50 @@ def get_datahub_entities(start: int = 0, limit: int = 10000) -> List[DHEntity]:
               }
             }
           }
-            ownership {
-              owners {
-                owner {
-                  ...on CorpUser { urn }
-                  ...on CorpGroup { urn }
+        """
+    )).substitute(field_vars=field_vars)
+
+    query_body_template = Template(
+        dedent(
+            """
+            {
+            search(input: {type: DATASET, query: "*", start:$start, count: $limit})
+            {
+              start
+              count
+              searchResults {
+                entity {
+                  urn
+                  type
+                  ...on Dataset {
+                    name
+                    properties {
+                      qualifiedName
+                      description
+                      customProperties {
+                        ...on CustomPropertiesEntry { key value }
+                      }
                     }
-                  type
-                }
-              }
-              tags {
-                tags {
-                  tag {
-                    ...on Tag {
-                      urn
-                      properties {
-                        ...on TagProperties { name }
+                    editableProperties { description }
+                    $schema_query
+                    ownership {
+                      owners {
+                        owner {
+                          ...on CorpUser { urn }
+                          ...on CorpGroup { urn }
+                        }
+                        type
+                      }
+                    }
+                    tags {
+                      tags {
+                        tag {
+                          ...on Tag {
+                            urn
+                            properties {
+                              ...on TagProperties { name }
+                            }
+                          }
                         }
                       }
                     }
@@ -218,10 +234,16 @@ def get_datahub_entities(start: int = 0, limit: int = 10000) -> List[DHEntity]:
                   }
                 }
               }
-    }
-  }"""
+            """
         )
-    ).substitute(field_vars=field_vars, start=start, limit=max(limit, 10000))
+    )
+
+    query_body = query_body_template.substitute(
+        start=start,
+        # DataHub's search accepts at most 10000 records per query, so cap the count
+        limit=min(limit, 10000),
+        schema_query=schema_query if with_schema else "",
+    )
 
     body = {"query": query_body, "variables": {}}
     dh_entities = datahub_post(body=body)["data"]["search"]["searchResults"]
@@ -387,35 +409,6 @@ def update_institutional_memory(
     return response[endpoint]
 
 
-def update_description(
-    resource_urn: str, description: str, column: str | None = None
-) -> bool:
-    """
-    :param resource_urn: The entity/resource URN to update
-    :param description: The new description
-    :param column: If left out, then the description is updated for the entity. If provided, then the
-        description will be applied to this column (field).
-    :return:
-    """
-    warnings.warn(
-        "update_description is deprecated and will be removed in the next release, "
-        "please use update_field_descriptions or update_dataset_description.",
-        DeprecationWarning,
-    )
-    subresource = (
-        f', subResourceType: DATASET_FIELD, subResource: "{column}"' if column else ""
-    )
-    body = {
-        "query": (
-            "mutation updateDescription { updateDescription(input: {"
-            f'resourceUrn: "{resource_urn}", description: "{description}"{subresource}'
-            "}) }"
-        ),
-        "variables": {},
-    }
-    return datahub_post(body=body)["data"]["updateDescription"]
-
-
 def set_group_owner(
     group_urn: str, resource_urns: List[str], owner_type: str = TECHNICAL_OWNER
 ):
diff --git a/src/datahub_tools/utils.py b/src/datahub_tools/utils.py
deleted file mode 100644
index a38da82..0000000
--- a/src/datahub_tools/utils.py
+++ /dev/null
@@ -1,33 +0,0 @@
-from __future__ import annotations
-
-import json
-import warnings
-from pathlib import Path
-from typing import Any, Dict, Iterable
-
-
-def extract_dbt_resources(
-    manifest_file: str | Path, resource_type_filter: Iterable[str] | None = None
-) -> Dict[str, Dict[str, Any]]:
-    """
-    :param manifest_file manifest file generated by dbt (e.g. manifest.json)
-    :param resource_type_filter An optional resource type filter that will be applied before the resources
-        are returned. For example ['snapshot', 'model']
-    :return: A dictionary containing the snowflake table name (e.g. prep.core.calendar)
-        and the associated dbt manifest dict (table metadata).
-    """
-    warnings.warn(
-        "deprecated, please use datahub_tools.dbt.extract_dbt_resources",
-        DeprecationWarning,
-    )
-    if isinstance(manifest_file, str):
-        manifest_file = Path(manifest_file)
-    with manifest_file.open() as f:
-        manifest = json.load(f)
-    manifest_nodes = manifest["nodes"]
-
-    return {
-        unique_id: data
-        for unique_id, data in manifest_nodes.items()
-        if (not resource_type_filter or data["resource_type"] in resource_type_filter)
-    }
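
Reviewer note on the logging change in `datahub_post`: the doubled backslash in `re.sub(r"\\n+\s*", ...)` works because `str()` on a dict renders its values with `repr`, so a real newline inside the query string shows up as the two literal characters `\` and `n`. A minimal standalone sketch (the query body here is made up for illustration):

```python
import re

# str() on a dict shows its values via repr, so the newline inside the query
# string is rendered as the two literal characters "\" and "n".
body = {"query": "query {\n    search\n}"}
print(str(body))  # {'query': 'query {\n    search\n}'}  <- literal backslash-n

# The pattern therefore targets the literal backslash-n plus trailing spaces,
# condensing the logged body onto a single line.
condensed = re.sub(r"\\n+\s*", "", str(body))
print(condensed)  # {'query': 'query {search}'}
```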
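
The new docstring recommends fetching entities in chunks when `with_schema=True` causes the endpoint to 503. A minimal sketch of that loop, using only the `get_datahub_entities` signature introduced by this patch; the helper name, the 1000-record page size, and the short-page stopping condition are illustrative assumptions, not part of the library:

```python
from datahub_tools.client import get_datahub_entities


def fetch_all_entities(page_size: int = 1000):
    """Illustrative helper: page through search results with schema data enabled."""
    entities = []
    start = 0
    while True:
        # each call returns at most `page_size` DHEntity records
        page = get_datahub_entities(start=start, limit=page_size, with_schema=True)
        entities.extend(page)
        if len(page) < page_size:  # short page -> assume no more results
            break
        start += page_size
    return entities
```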