Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(ingest/unity): use fully qualified catalog/schema patterns #7900

Merged
merged 3 commits into from
May 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ export const UNITY_CATALOG_ALLOW: RecipeField = {
label: 'Allow Patterns',
tooltip:
'Only include specific Catalogs by providing the name of a Catalog, or a Regular Expression (REGEX) to include specific Catalogs. If not provided, all Catalogs will be included.',
placeholder: 'my_catalog',
placeholder: 'metastore.my_catalog',
type: FieldType.LIST,
buttonLabel: 'Add pattern',
fieldPath: catalogAllowFieldPath,
Expand All @@ -106,11 +106,11 @@ export const UNITY_CATALOG_ALLOW: RecipeField = {

const catalogDenyFieldPath = 'source.config.catalog_pattern.deny';
export const UNITY_CATALOG_DENY: RecipeField = {
name: 'catalog_pattern.allow',
label: 'Allow Patterns',
name: 'catalog_pattern.deny',
label: 'Deny Patterns',
tooltip:
'Exclude specific Catalogs by providing the name of a Catalog, or a Regular Expression (REGEX) to exclude specific Catalogs. If not provided, all Catalogs will be included. Deny patterns always take precedence over Allow patterns.',
placeholder: 'my_catalog',
placeholder: 'metastore.my_catalog',
type: FieldType.LIST,
buttonLabel: 'Add pattern',
fieldPath: catalogDenyFieldPath,
Expand All @@ -120,9 +120,9 @@ export const UNITY_CATALOG_DENY: RecipeField = {
setListValuesOnRecipe(recipe, values, catalogDenyFieldPath),
};

const tableAllowFieldPath = 'source.config.metastore_id_pattern.allow';
const tableAllowFieldPath = 'source.config.table_pattern.allow';
export const UNITY_TABLE_ALLOW: RecipeField = {
name: 'catalog_pattern.allow',
name: 'table_pattern.allow',
label: 'Allow Patterns',
tooltip:
'Only include specific Tables by providing the fully-qualified name of a Table, or a Regular Expression (REGEX) to include specific Tables. If not provided, all Tables will be included.',
Expand All @@ -136,10 +136,10 @@ export const UNITY_TABLE_ALLOW: RecipeField = {
setListValuesOnRecipe(recipe, values, tableAllowFieldPath),
};

const tableDenyFieldPath = 'source.config.metastore_id_pattern.deny';
const tableDenyFieldPath = 'source.config.table_pattern.deny';
export const UNITY_TABLE_DENY: RecipeField = {
name: 'catalog_pattern.allow',
label: 'Allow Patterns',
name: 'table_pattern.deny',
label: 'Deny Patterns',
tooltip:
'Exclude specific Tables by providing the fully-qualified name of a Table, or a Regular Expression (REGEX) to exclude specific Tables. If not provided, all Tables will be included. Deny patterns always take precedence over Allow patterns.',
placeholder: 'catalog.schema.table',
Expand Down
19 changes: 14 additions & 5 deletions docs/how/updating-datahub.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,27 @@ This file documents any backwards-incompatible changes in DataHub and assists pe

### Breaking Changes

- #7900: The `catalog_pattern` and `schema_pattern` options of the Unity Catalog source now match against the fully qualified name of the catalog/schema instead of just the name. Unless you're using regex `^` in your patterns, this should not affect you.

### Potential Downtime

### Deprecations

### Other notable Changes

## 0.10.2

### Breaking Changes

- #7016 Add `add_database_name_to_urn` flag to the Oracle source, which ensures that Dataset urns have the DB name as a prefix to prevent collisions (e.g. {database}.{schema}.{table}). This is ONLY breaking if you set this flag to true; otherwise behavior remains the same.
- The Airflow plugin no longer includes the DataHub Kafka emitter by default. Use `pip install acryl-datahub-airflow-plugin[datahub-kafka]` for Kafka support.
- The Airflow lineage backend no longer includes the DataHub Kafka emitter by default. Use `pip install acryl-datahub[airflow,datahub-kafka]` for Kafka support.
- Java SDK PatchBuilders have been modified in a backwards incompatible way to align more with the Python SDK and support more use cases. Any application utilizing the Java SDK for patch building may be affected on upgrading this dependency.

### Potential Downtime

### Deprecations
- The docker image and script for updating from Elasticsearch 6 to 7 are no longer being maintained and will be removed from the `/contrib` section of
the repository. Please refer to older releases if needed.

### Other notable Changes
- The docker image and script for updating from Elasticsearch 6 to 7 are no longer being maintained and will be removed from the `/contrib` section of
the repository. Please refer to older releases if needed.

## 0.10.0

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,17 +29,17 @@ class UnityCatalogSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigM

catalog_pattern: AllowDenyPattern = Field(
default=AllowDenyPattern.allow_all(),
description="Regex patterns for catalogs to filter in ingestion. Specify regex to match the catalog name",
description="Regex patterns for catalogs to filter in ingestion. Specify regex to match the full `metastore.catalog` name.",
)

schema_pattern: AllowDenyPattern = Field(
default=AllowDenyPattern.allow_all(),
description="Regex patterns for schemas to filter in ingestion. Specify regex to only match the schema name. e.g. to match all tables in schema analytics, use the regex 'analytics'",
description="Regex patterns for schemas to filter in ingestion. Specify regex to the full `metastore.catalog.schema` name. e.g. to match all tables in schema analytics, use the regex `^mymetastore\\.mycatalog\\.analytics$`.",
)

table_pattern: AllowDenyPattern = Field(
default=AllowDenyPattern.allow_all(),
description="Regex patterns for tables to filter in ingestion. Specify regex to match the entire table name in catalog.schema.table format. e.g. to match all tables starting with customer in Customer catalog and public schema, use the regex 'Customer.public.customer.*'",
description="Regex patterns for tables to filter in ingestion. Specify regex to match the entire table name in `catalog.schema.table` format. e.g. to match all tables starting with customer in Customer catalog and public schema, use the regex `Customer\\.public\\.customer.*`.",
)
domain: Dict[str, AllowDenyPattern] = Field(
default=dict(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ def process_catalogs(
self, metastore: proxy.Metastore
) -> Iterable[MetadataWorkUnit]:
for catalog in self.unity_catalog_api_proxy.catalogs(metastore=metastore):
if not self.config.catalog_pattern.allowed(catalog.name):
if not self.config.catalog_pattern.allowed(catalog.id):
self.report.catalogs.dropped(catalog.id)
continue

Expand All @@ -233,7 +233,7 @@ def process_catalogs(

def process_schemas(self, catalog: proxy.Catalog) -> Iterable[MetadataWorkUnit]:
for schema in self.unity_catalog_api_proxy.schemas(catalog=catalog):
if not self.config.schema_pattern.allowed(schema.name):
if not self.config.schema_pattern.allowed(schema.id):
self.report.schemas.dropped(schema.id)
continue

Expand Down