Skip to content

Commit

Permalink
do not return schema data by default (#8)
Browse files Browse the repository at this point in the history
  • Loading branch information
Ada Draginda authored Feb 21, 2023
1 parent a5d87a8 commit 618a777
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 107 deletions.
21 changes: 21 additions & 0 deletions CHANGE_LOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,26 @@
# Change Log

### v1.0.0 - 2023-02-21 Ada Draginda
#### Deprecations
* `extract_dbt_resources` has moved from a soft to a hard deprecation. Instead, use
`datahub_tools.dbt.extract_dbt_resources`
* `client.update_descriptions` has moved from a soft to a hard deprecation. Instead, use
`update_field_descriptions` or `update_dataset_description`

#### Changes
* DataHub posts are now logged with fewer linebreaks and repeated spaces
* `client.get_datahub_entities` no longer returns schema data, by default. You can turn this feature
back on with the `with_schema` argument. This change was made for performance reasons.

### v0.4.0 - 2023-02-09 Ada Draginda
#### Changes
* Added new `get_owners` to the client module

### v0.3.0 - 2023-02-07 Ada Draginda
#### Changes
* Added an example on how to use transformers
* New DBT module for fetching DBT dependency lineage

### v0.2.0 - 2023-01-31 Ada Draginda
#### Deprecations
* `client.update_description` has been deprecated in favor of `client.update_field_descriptions`
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ build-backend = 'setuptools.build_meta'
[project]
name = 'datahub_tools'
description = 'Python tools for working with DataHub'
version = '0.4.1'
version = '1.0.0'
readme = 'README.md'
requires-python = '>=3.7'
dependencies = [
Expand Down
139 changes: 66 additions & 73 deletions src/datahub_tools/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import logging
import os
import re
import warnings
from string import Template
from textwrap import dedent
from typing import Dict, Iterable, List
Expand Down Expand Up @@ -53,7 +52,9 @@ def datahub_post(body: Dict) -> Dict:
graphql_url = get_dh_graphql_url()
logger = logging.getLogger(__name__)
# the sub just condenses down the body e.g.: 'query: \n {...' -> 'query: {...'
logger.info("posting to %s: %s", graphql_url, re.sub(r"\n+\s*", "", str(body)))
# Note that the extra backslash is needed (\\n+) because body is a dict and calling
# str will inject additional escape characters.
logger.info("posting to %s: %s", graphql_url, re.sub(r"\\n+\s*", "", str(body)))

response = requests.post(url=graphql_url, headers=headers, json=body)
response.raise_for_status()
Expand Down Expand Up @@ -119,10 +120,16 @@ def emit_metadata(
emitter.emit(metadata_event)


def get_datahub_entities(start: int = 0, limit: int = 10000) -> List[DHEntity]:
def get_datahub_entities(
start: int = 0, limit: int = 10000, with_schema: bool = False
) -> List[DHEntity]:
"""
:param start: Index of the first record to return
:param limit: Maximum number of records to return (default and maximum per query is 10000).
:param limit: Maximum number of records to return (default and maximum per query
is 10000).
:param with_schema: If True (default is False) then schema fields and descriptions
will be retrieved (warning: may be slow or cause the DataHub endpoint to 503, in
which case you will need to retrieve your entities in chunks).
:return: Dictionary of snowflake name (e.g. prep.core.calendar) to DataHub urn
e.g. urn:li:dataset:(urn:li:dataPlatform:snowflake,prep.core.calendar,PROD)
"""
Expand Down Expand Up @@ -158,28 +165,9 @@ def get_datahub_entities(start: int = 0, limit: int = 10000) -> List[DHEntity]:
"""
)

query_body = Template(
dedent(
"""{
search(input: {type: DATASET, query: "*", start:$start, count: $limit})
{
start
count
searchResults {
entity {
urn
type
...on Dataset {
name
properties {
qualifiedName
description
customProperties {
...on CustomPropertiesEntry { key value }
}
}
editableProperties { description }
schemaMetadata {
schema_query = dedent(
"""
schemaMetadata {
fields {
...on SchemaField {
$field_vars
Expand All @@ -194,22 +182,50 @@ def get_datahub_entities(start: int = 0, limit: int = 10000) -> List[DHEntity]:
}
}
}
ownership {
owners {
owner {
...on CorpUser { urn }
...on CorpGroup { urn }
"""
)

query_body_template = Template(
dedent(
"""
{
search(input: {type: DATASET, query: "*", start:$start, count: $limit})
{
start
count
searchResults {
entity {
urn
type
...on Dataset {
name
properties {
qualifiedName
description
customProperties {
...on CustomPropertiesEntry { key value }
}
}
type
}
}
tags {
tags {
tag {
...on Tag {
urn
properties {
...on TagProperties { name }
editableProperties { description }
$schema_query
ownership {
owners {
owner {
...on CorpUser { urn }
...on CorpGroup { urn }
}
type
}
}
tags {
tags {
tag {
...on Tag {
urn
properties {
...on TagProperties { name }
}
}
}
}
}
Expand All @@ -218,10 +234,16 @@ def get_datahub_entities(start: int = 0, limit: int = 10000) -> List[DHEntity]:
}
}
}
}
}"""
"""
)
).substitute(field_vars=field_vars, start=start, limit=max(limit, 10000))
)

query_body = query_body_template.substitute(
field_vars=field_vars,
start=start,
limit=max(limit, 10000),
schema_query=schema_query if with_schema else "",
)

body = {"query": query_body, "variables": {}}
dh_entities = datahub_post(body=body)["data"]["search"]["searchResults"]
Expand Down Expand Up @@ -387,35 +409,6 @@ def update_institutional_memory(
return response[endpoint]


def update_description(
    resource_urn: str, description: str, column: str | None = None
) -> bool:
    """
    Deprecated: use update_field_descriptions or update_dataset_description instead.

    Posts an ``updateDescription`` GraphQL mutation through ``datahub_post``.

    :param resource_urn: The entity/resource URN to update
    :param description: The new description
    :param column: If left out, then the description is updated for the entity. If
      provided, then the description will be applied to this column (field).
    :return: The ``updateDescription`` value found under ``data`` in the response
      returned by ``datahub_post``
    """
    # Local import so this block does not depend on a file-level ``import json``.
    import json

    warnings.warn(
        "update_description is deprecated and will be removed in the next release, "
        "please use update_field_descriptions or update_dataset_description.",
        DeprecationWarning,
        # Attribute the warning to the caller rather than to this function.
        stacklevel=2,
    )

    def _quoted(value: str) -> str:
        # Render ``value`` as a double-quoted string literal with quotes,
        # backslashes and control characters escaped, so that it cannot terminate
        # the literal early or inject extra input fields into the mutation.
        # ensure_ascii=False leaves non-ASCII text untouched.
        return json.dumps(value, ensure_ascii=False)

    # Only target a schema field when a (non-empty) column name was given.
    subresource = (
        f", subResourceType: DATASET_FIELD, subResource: {_quoted(column)}"
        if column
        else ""
    )
    body = {
        "query": (
            "mutation updateDescription { updateDescription(input: {"
            f"resourceUrn: {_quoted(resource_urn)}, "
            f"description: {_quoted(description)}{subresource}"
            "}) }"
        ),
        "variables": {},
    }
    return datahub_post(body=body)["data"]["updateDescription"]


def set_group_owner(
group_urn: str, resource_urns: List[str], owner_type: str = TECHNICAL_OWNER
):
Expand Down
33 changes: 0 additions & 33 deletions src/datahub_tools/utils.py

This file was deleted.

0 comments on commit 618a777

Please sign in to comment.