From 3eced4d100f8e7fe91827678bd321c242718ed94 Mon Sep 17 00:00:00 2001 From: Doug Beatty Date: Sun, 16 Jul 2023 21:06:16 -0600 Subject: [PATCH 1/4] Null safety for `deduplicate` via `row_alias` keyword argument --- macros/sql/deduplicate.sql | 74 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/macros/sql/deduplicate.sql b/macros/sql/deduplicate.sql index 3e75579c..edb2bfc9 100644 --- a/macros/sql/deduplicate.sql +++ b/macros/sql/deduplicate.sql @@ -1,8 +1,40 @@ {%- macro deduplicate(relation, partition_by, order_by) -%} - {{ return(adapter.dispatch('deduplicate', 'dbt_utils')(relation, partition_by, order_by)) }} + {{ return(adapter.dispatch('deduplicate', 'dbt_utils')(relation, partition_by, order_by, **kwargs)) }} {% endmacro %} -{%- macro default__deduplicate(relation, partition_by, order_by) -%} +{# +-- ⚠️ This macro drops rows that contain NULL values ⚠️ + +-- The implementation below uses a natural join which avoids returning an +-- extra column at the cost of not being null safe. + +-- dbt_utils._safe_deduplicate is an alternative that avoids dropping rows +-- that contain NULL values at the cost of adding an extra column. +#} +{%- macro _unsafe_deduplicate(relation, partition_by, order_by) -%} + +{%- set error_message = " +Warning: the implementation of the `deduplicate` macro for the `{}` adapter is not null safe. \ + +Set `row_alias` within calls to `deduplicate` to achieve null safety (which will also add it \ +as an extra column to the output). + +e.g., + {{ + dbt_utils.deduplicate( + 'my_cte', + partition_by='user_id', + order_by='version desc', + row_alias='rn' + ) | indent + }} + +Warning triggered by model: {}.{} +dbt project / package: {} +path: {} +".format(target.type, model.package_name, model.name, model.package_name, model.original_file_path) -%} + +{%- do exceptions.warn(error_message) -%} with row_numbered as ( select @@ -29,6 +61,44 @@ {%- endmacro -%} +{# +-- For data platforms that don't support QUALIFY or an equivalent, the +-- best we can do to ensure null safety is to use a window function + +-- filter (which returns an extra column): +-- https://modern-sql.com/caniuse/qualify +#} +{%- macro _safe_deduplicate(relation, partition_by, order_by, row_alias="rn") -%} + + with row_numbered as ( + select + _inner.*, + row_number() over ( + partition by {{ partition_by }} + order by {{ order_by }} + ) as {{ row_alias }} + from {{ relation }} as _inner + ) + + select * + from row_numbered + where {{ row_alias }} = 1 + +{%- endmacro -%} + +{# +-- ⚠️ This macro drops rows that contain NULL values unless the `row_alias` parameter is included ⚠️ +#} +{%- macro default__deduplicate(relation, partition_by, order_by) -%} + {% set row_alias = kwargs.get('row_alias') %} + + {% if row_alias != None %} + {{ dbt_utils._safe_deduplicate(relation, partition_by, order_by, row_alias=row_alias) }} + {% else %} + {{ dbt_utils._unsafe_deduplicate(relation, partition_by, order_by) }} + {% endif %} + +{%- endmacro -%} + -- Redshift has the `QUALIFY` syntax: -- https://docs.aws.amazon.com/redshift/latest/dg/r_QUALIFY_clause.html {% macro redshift__deduplicate(relation, partition_by, order_by) -%} From d46676e98bd3e0c5f3d4e5056ff83c765720a3a0 Mon Sep 17 00:00:00 2001 From: Doug Beatty Date: Sun, 16 Jul 2023 21:06:16 -0600 Subject: [PATCH 2/4] Null safety for `deduplicate` via `columns` keyword argument --- macros/sql/deduplicate.sql | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/macros/sql/deduplicate.sql b/macros/sql/deduplicate.sql index edb2bfc9..f376344a 100644 --- a/macros/sql/deduplicate.sql +++ b/macros/sql/deduplicate.sql @@ -67,11 +67,23 @@ path: {} -- filter (which returns an extra column): -- https://modern-sql.com/caniuse/qualify #} -{%- macro _safe_deduplicate(relation, partition_by, order_by, row_alias="rn") -%} +{%- macro _safe_deduplicate(relation, partition_by, order_by, row_alias="rn", columns=none) -%} + + {% if not row_alias %} + {% set row_alias = "rn" %} + {% endif %} with row_numbered as ( select + + {% if columns != None %} + {% for column in columns %} + {{ column }}, + {% endfor %} + {% else %} _inner.*, + {% endif %} + row_number() over ( partition by {{ partition_by }} order by {{ order_by }} @@ -90,9 +102,10 @@ path: {} #} {%- macro default__deduplicate(relation, partition_by, order_by) -%} {% set row_alias = kwargs.get('row_alias') %} + {% set columns = kwargs.get('columns') %} - {% if row_alias != None %} - {{ dbt_utils._safe_deduplicate(relation, partition_by, order_by, row_alias=row_alias) }} + {% if row_alias != None or columns != None %} + {{ dbt_utils._safe_deduplicate(relation, partition_by, order_by, row_alias=row_alias, columns=columns) }} {% else %} {{ dbt_utils._unsafe_deduplicate(relation, partition_by, order_by) }} {% endif %} From fe03f43cc5f57d86006addfa1770ab78230cd725 Mon Sep 17 00:00:00 2001 From: Doug Beatty Date: Sun, 16 Jul 2023 21:06:16 -0600 Subject: [PATCH 3/4] Null safety for `deduplicate` when `relation` is not a CTE --- macros/sql/deduplicate.sql | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/macros/sql/deduplicate.sql b/macros/sql/deduplicate.sql index f376344a..d4e8cef0 100644 --- a/macros/sql/deduplicate.sql +++ b/macros/sql/deduplicate.sql @@ -104,7 +104,10 @@ path: {} {% set row_alias = kwargs.get('row_alias') %} {% set columns = kwargs.get('columns') %} - {% if row_alias != None or columns != None %} + {% if relation.is_cte is defined and not relation.is_cte %} + {% set columns = dbt_utils.get_filtered_columns_in_relation(relation) %} + {{ dbt_utils._safe_deduplicate(relation, partition_by, order_by, columns=columns) }} + {% elif row_alias != None or columns != None %} {{ dbt_utils._safe_deduplicate(relation, partition_by, order_by, row_alias=row_alias, columns=columns) }} {% else %} {{ dbt_utils._unsafe_deduplicate(relation, partition_by, order_by) }} From e13d72d53bb5f134aedb0f1320be8988bf840e6c Mon Sep 17 00:00:00 2001 From: Doug Beatty Date: Sun, 16 Jul 2023 21:06:16 -0600 Subject: [PATCH 4/4] Update caveats --- macros/sql/deduplicate.sql | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/macros/sql/deduplicate.sql b/macros/sql/deduplicate.sql index d4e8cef0..afa7a76e 100644 --- a/macros/sql/deduplicate.sql +++ b/macros/sql/deduplicate.sql @@ -98,7 +98,10 @@ path: {} {%- endmacro -%} {# --- ⚠️ This macro drops rows that contain NULL values unless the `row_alias` parameter is included ⚠️ +-- ⚠️ This macro drops rows that contain NULL values unless one of the following is true: +-- - `relation` parameter is a non-CTE dbt Relation +-- - `row_alias` parameter is included +-- - `columns` parameter is included #} {%- macro default__deduplicate(relation, partition_by, order_by) -%} {% set row_alias = kwargs.get('row_alias') %}