Skip to content

Commit

Permalink
[PECO-921] (Reimplemented) Fix: allow DESCRIBE TABLE EXTENDED to hand…
Browse files Browse the repository at this point in the history
…le more than 2048 characters (#405)

---------

Signed-off-by: Jesse Whitehouse <[email protected]>
  • Loading branch information
Jesse committed Aug 29, 2023
1 parent b173a6d commit b10eabf
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 1 deletion.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
## dbt-databricks 1.6.x (Release TBD)

### Features

- Follow up: re-implement fix for issue where the show tables extended command is limited to 2048 characters. ([#326](https://github.com/databricks/dbt-databricks/pull/326)). Set `DBT_DESCRIBE_TABLE_2048_CHAR_BYPASS` to `true` to enable this behaviour.

## dbt-databricks 1.6.1 (August 2, 2023)

### Fixes
Expand Down
20 changes: 19 additions & 1 deletion dbt/adapters/databricks/impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from contextlib import contextmanager
from itertools import chain
from dataclasses import dataclass
import os
import re
from typing import (
Any,
Expand Down Expand Up @@ -79,6 +80,22 @@ def check_not_found_error(errmsg: str) -> bool:
return new_error or old_error is not None


def get_identifier_list_string(table_names: Set[str]) -> str:
"""Returns `"|".join(table_names)` by default.
Returns `"*"` if `DBT_DESCRIBE_TABLE_2048_CHAR_BYPASS` == `"true"`
and the joined string exceeds 2048 characters
This is for AWS Glue Catalog users. See issue #325.
"""

_identifier = "|".join(table_names)
bypass_2048_char_limit = os.environ.get("DBT_DESCRIBE_TABLE_2048_CHAR_BYPASS", "false")
if bypass_2048_char_limit == "true":
_identifier = _identifier if len(_identifier) < 2048 else "*"
return _identifier


@undefined_proof
class DatabricksAdapter(SparkAdapter):
Relation = DatabricksRelation
Expand Down Expand Up @@ -448,11 +465,12 @@ def _get_one_catalog(
table_names.add(relation.identifier)

columns: List[Dict[str, Any]] = []

if len(table_names) > 0:
schema_relation = self.Relation.create(
database=database,
schema=schema,
identifier="|".join(table_names),
identifier=get_identifier_list_string(table_names),
quote_policy=self.config.quoting,
)
for relation, information in self._list_relations_with_information(schema_relation):
Expand Down
64 changes: 64 additions & 0 deletions tests/unit/test_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from dbt.adapters.databricks import __version__
from dbt.adapters.databricks import DatabricksAdapter, DatabricksRelation
from dbt.adapters.databricks.impl import check_not_found_error
from dbt.adapters.databricks.impl import get_identifier_list_string
from dbt.adapters.databricks.connections import (
CATALOG_KEY_IN_SESSION_PROPERTIES,
DBT_DATABRICKS_INVOCATION_ENV,
Expand Down Expand Up @@ -947,6 +948,69 @@ def test_parse_columns_from_information_with_table_type_and_parquet_provider(sel
},
)

def test_describe_table_extended_2048_char_limit(self):
"""GIVEN a list of table_names whos total character length exceeds 2048 characters
WHEN the environment variable DBT_DESCRIBE_TABLE_2048_CHAR_BYPASS is "true"
THEN the identifier list is replaced with "*"
"""

table_names: set(str) = set([f"customers_{i}" for i in range(200)])

# By default, don't limit the number of characters
self.assertEqual(get_identifier_list_string(table_names), "|".join(table_names))

# If environment variable is set, then limit the number of characters
with mock.patch.dict("os.environ", **{"DBT_DESCRIBE_TABLE_2048_CHAR_BYPASS": "true"}):

# Long list of table names is capped
self.assertEqual(get_identifier_list_string(table_names), "*")

# Short list of table names is not capped
self.assertEqual(
get_identifier_list_string(list(table_names)[:5]), "|".join(list(table_names)[:5])
)

def test_describe_table_extended_should_not_limit(self):
"""GIVEN a list of table_names whos total character length exceeds 2048 characters
WHEN the environment variable DBT_DESCRIBE_TABLE_2048_CHAR_BYPASS is not set
THEN the identifier list is not truncated
"""

table_names: set(str) = set([f"customers_{i}" for i in range(200)])

# By default, don't limit the number of characters
self.assertEqual(get_identifier_list_string(table_names), "|".join(table_names))

def test_describe_table_extended_should_limit(self):
"""GIVEN a list of table_names whos total character length exceeds 2048 characters
WHEN the environment variable DBT_DESCRIBE_TABLE_2048_CHAR_BYPASS is "true"
THEN the identifier list is replaced with "*"
"""

table_names: set(str) = set([f"customers_{i}" for i in range(200)])

# If environment variable is set, then limit the number of characters
with mock.patch.dict("os.environ", **{"DBT_DESCRIBE_TABLE_2048_CHAR_BYPASS": "true"}):

# Long list of table names is capped
self.assertEqual(get_identifier_list_string(table_names), "*")

def test_describe_table_extended_may_limit(self):
"""GIVEN a list of table_names whos total character length does not 2048 characters
WHEN the environment variable DBT_DESCRIBE_TABLE_2048_CHAR_BYPASS is "true"
THEN the identifier list is not truncated
"""

table_names: set(str) = set([f"customers_{i}" for i in range(200)])

# If environment variable is set, then we may limit the number of characters
with mock.patch.dict("os.environ", **{"DBT_DESCRIBE_TABLE_2048_CHAR_BYPASS": "true"}):

# But a short list of table names is not capped
self.assertEqual(
get_identifier_list_string(list(table_names)[:5]), "|".join(list(table_names)[:5])
)


class TestCheckNotFound(unittest.TestCase):
def test_prefix(self):
Expand Down

0 comments on commit b10eabf

Please sign in to comment.