From a7f4f51fe1d9883150f01f623e83f1ab197efcbb Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Wed, 23 Feb 2022 15:53:49 +1300 Subject: [PATCH 1/8] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index bab3a503..f22c3809 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ For compatibility details between versions of dbt-core and dbt-utils, [see this [Materializations](#materializations): - [insert_by_period](#insert_by_period-source) ---- +---- ### Schema Tests #### equal_rowcount ([source](macros/schema_tests/equal_rowcount.sql)) This schema test asserts the that two relations have the same number of rows. From 168396fb2babd889ed2acba6fb8242563fd4bf27 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Mon, 28 Feb 2022 15:51:08 +1300 Subject: [PATCH 2/8] Mutually excl range examples in disclosure triangle --- README.md | 99 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 52 insertions(+), 47 deletions(-) diff --git a/README.md b/README.md index f22c3809..709640b3 100644 --- a/README.md +++ b/README.md @@ -377,53 +377,58 @@ models: partition_by: customer_id gaps: allowed ``` - -**Understanding the `gaps` argument:** -Here are a number of examples for each allowed `gaps` argument. -* `gaps: not_allowed`: The upper bound of one record must be the lower bound of -the next record. - -| lower_bound | upper_bound | -|-------------|-------------| -| 0 | 1 | -| 1 | 2 | -| 2 | 3 | - -* `gaps: allowed` (default): There may be a gap between the upper bound of one -record and the lower bound of the next record. - -| lower_bound | upper_bound | -|-------------|-------------| -| 0 | 1 | -| 2 | 3 | -| 3 | 4 | - -* `gaps: required`: There must be a gap between the upper bound of one record and -the lower bound of the next record (common for date ranges). - -| lower_bound | upper_bound | -|-------------|-------------| -| 0 | 1 | -| 2 | 3 | -| 4 | 5 | - -**Understanding the `zero_length_range_allowed` argument:** -Here are a number of examples for each allowed `zero_length_range_allowed` argument. -* `zero_length_range_allowed: false`: (default) The upper bound of each record must be greater than its lower bound. - -| lower_bound | upper_bound | -|-------------|-------------| -| 0 | 1 | -| 1 | 2 | -| 2 | 3 | - -* `zero_length_range_allowed: true`: The upper bound of each record can be greater than or equal to its lower bound. - -| lower_bound | upper_bound | -|-------------|-------------| -| 0 | 1 | -| 2 | 2 | -| 3 | 4 | +
+Additional `gaps` and `zero_length_range_allowed` examples + + **Understanding the `gaps` argument:** + + Here are a number of examples for each allowed `gaps` argument. + * `gaps: not_allowed`: The upper bound of one record must be the lower bound of + the next record. + + | lower_bound | upper_bound | + |-------------|-------------| + | 0 | 1 | + | 1 | 2 | + | 2 | 3 | + + * `gaps: allowed` (default): There may be a gap between the upper bound of one + record and the lower bound of the next record. + + | lower_bound | upper_bound | + |-------------|-------------| + | 0 | 1 | + | 2 | 3 | + | 3 | 4 | + + * `gaps: required`: There must be a gap between the upper bound of one record and + the lower bound of the next record (common for date ranges). + + | lower_bound | upper_bound | + |-------------|-------------| + | 0 | 1 | + | 2 | 3 | + | 4 | 5 | + + **Understanding the `zero_length_range_allowed` argument:** + Here are a number of examples for each allowed `zero_length_range_allowed` argument. + * `zero_length_range_allowed: false`: (default) The upper bound of each record must be greater than its lower bound. + + | lower_bound | upper_bound | + |-------------|-------------| + | 0 | 1 | + | 1 | 2 | + | 2 | 3 | + + * `zero_length_range_allowed: true`: The upper bound of each record can be greater than or equal to its lower bound. + + | lower_bound | upper_bound | + |-------------|-------------| + | 0 | 1 | + | 2 | 2 | + | 3 | 4 | + +
#### sequential_values ([source](macros/schema_tests/sequential_values.sql)) This test confirms that a column contains sequential values. It can be used From 5717b109d358844e0f085565b93e74cbe469ecb0 Mon Sep 17 00:00:00 2001 From: Joel Labes Date: Thu, 3 Mar 2022 14:24:19 +1300 Subject: [PATCH 3/8] Fix union_relations error when no include/exclude provided * Fix union_relations error when no include/exclude provided (#509) * Update CHANGELOG.md --- CHANGELOG.md | 4 ++++ macros/sql/union.sql | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 67b690e7..f346c55b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +# dbt-utils v0.8.2 +## Fixes +- Fix union_relations error from [#473](https://github.com/dbt-labs/dbt-utils/pull/473) when no include/exclude parameters are provided ([#505](https://github.com/dbt-labs/dbt-utils/issues/505), [#509](https://github.com/dbt-labs/dbt-utils/pull/509)) + # dbt-utils v0.8.1 ## New features diff --git a/macros/sql/union.sql b/macros/sql/union.sql index 009a765a..a7bf1d95 100644 --- a/macros/sql/union.sql +++ b/macros/sql/union.sql @@ -61,7 +61,7 @@ {%- set ordered_column_names = column_superset.keys() -%} - {%- if not column_superset.keys() -%} + {% if (include | length > 0 or exclude | length > 0) and not column_superset.keys() %} {%- set relations_string -%} {%- for relation in relations -%} {{ relation.name }} From ea38b6b3e66c6050f642c2d5228ee68bd19120e2 Mon Sep 17 00:00:00 2001 From: Judah Rand <17158624+judahrand@users.noreply.github.com> Date: Sun, 6 Mar 2022 15:55:16 +0000 Subject: [PATCH 4/8] Add dedupe macro --- macros/sql/dedupe.sql | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 macros/sql/dedupe.sql diff --git a/macros/sql/dedupe.sql b/macros/sql/dedupe.sql new file mode 100644 index 00000000..0d875db1 --- /dev/null +++ b/macros/sql/dedupe.sql @@ -0,0 +1,41 @@ +{%- macro dedupe(table, group_by, order_by=none) -%} + {{ return(adapter.dispatch('dedupe', 'dbt_utils')(table, group_by, order_by=order_by)) }} +{% endmacro %} + +{%- macro default__dedupe(table, group_by, order_by=none) -%} + + select + {{ dbt_utils.star(table, relation_alias='deduped') | indent }} + from ( + select + _inner.*, + row_number() over ( + partition by {{ group_by }} + {% if order_by != none -%} + order by {{ order_by }} + {%- endif %} + ) as rn + from {{ table }} as _inner + ) as deduped + where deduped.rn = 1 + +{%- endmacro -%} + +{%- macro bigquery__dedupe(table, group_by, order_by=none) -%} + + select + {{ dbt_utils.star(table, relation_alias='deduped') | indent }} + from ( + select + array_agg ( + original + {% if order_by != none -%} + order by {{ order_by }} + {%- endif %} + limit 1 + )[offset(0)] as deduped + from {{ table }} as original + group by {{ group_by }} + ) + +{%- endmacro -%} From 821c1d0f69c38c61aa14111416076237f9f0fc31 Mon Sep 17 00:00:00 2001 From: Judah Rand <17158624+judahrand@users.noreply.github.com> Date: Sun, 6 Mar 2022 16:07:44 +0000 Subject: [PATCH 5/8] Add test for dedupe macro --- integration_tests/data/sql/data_dedupe.csv | 3 +++ integration_tests/data/sql/data_dedupe_expected.csv | 2 ++ integration_tests/models/sql/schema.yml | 9 +++++++-- integration_tests/models/sql/test_dedupe.sql | 7 +++++++ 4 files changed, 19 insertions(+), 2 deletions(-) create mode 100644 integration_tests/data/sql/data_dedupe.csv create mode 100644 integration_tests/data/sql/data_dedupe_expected.csv create mode 100644 integration_tests/models/sql/test_dedupe.sql diff --git a/integration_tests/data/sql/data_dedupe.csv b/integration_tests/data/sql/data_dedupe.csv new file mode 100644 index 00000000..c3ae0c4d --- /dev/null +++ b/integration_tests/data/sql/data_dedupe.csv @@ -0,0 +1,3 @@ +user_id,event,version +1,play,1 +1,play,2 diff --git a/integration_tests/data/sql/data_dedupe_expected.csv b/integration_tests/data/sql/data_dedupe_expected.csv new file mode 100644 index 00000000..de5e204d --- /dev/null +++ b/integration_tests/data/sql/data_dedupe_expected.csv @@ -0,0 +1,2 @@ +user_id,event,version +1,play,2 diff --git a/integration_tests/models/sql/schema.yml b/integration_tests/models/sql/schema.yml index 616dd386..6fbe4866 100644 --- a/integration_tests/models/sql/schema.yml +++ b/integration_tests/models/sql/schema.yml @@ -85,7 +85,7 @@ models: tests: - dbt_utils.equality: compare_model: ref('data_pivot_expected') - + - name: test_pivot_apostrophe tests: - dbt_utils.equality: @@ -137,8 +137,13 @@ models: tests: - dbt_utils.equality: compare_model: ref('data_union_expected') - + - name: test_get_relations_by_pattern tests: - dbt_utils.equality: compare_model: ref('data_union_events_expected') + + - name: test_dedupe + tests: + - dbt_utils.equality: + compare_model: ref('data_dedupe_expected') diff --git a/integration_tests/models/sql/test_dedupe.sql b/integration_tests/models/sql/test_dedupe.sql new file mode 100644 index 00000000..1d078002 --- /dev/null +++ b/integration_tests/models/sql/test_dedupe.sql @@ -0,0 +1,7 @@ +WITH deduped as ( + + {{ dbt_utils.dedupe(ref('data_dedupe'), group_by='user_id', order_by='version desc') | indent }} + +) + +select * from deduped From f88d49b484262dd3e9baaceca3e0cb5b76c1333a Mon Sep 17 00:00:00 2001 From: Judah Rand <17158624+judahrand@users.noreply.github.com> Date: Sun, 6 Mar 2022 17:58:41 +0000 Subject: [PATCH 6/8] Add documentation to README --- README.md | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 709640b3..10e615ef 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,7 @@ For compatibility details between versions of dbt-core and dbt-utils, [see this - [SQL generators](#sql-generators) - [date_spine](#date_spine-source) + - [dedupe](#dedupe-source) - [haversine_distance](#haversine_distance-source) - [group_by](#group_by-source) - [star](#star-source) @@ -705,6 +706,20 @@ This macro returns the sql required to build a date spine. The spine will includ }} ``` +#### dedupe ([source](macros/sql/dedupe.sql)) +This macro returns the sql required to remove deduplicate rows from a model or source. + +**Usage:** + +``` +{{ dbt_utils.dedupe( + source('my_source', 'my_table'), + "user_id, cast(timestamp as day)", + order_by="timestamp desc" + ) +}} +``` + #### haversine_distance ([source](macros/sql/haversine_distance.sql)) This macro calculates the [haversine distance](http://daynebatten.com/2015/09/latitude-longitude-distance-sql/) between a pair of x/y coordinates. @@ -747,7 +762,7 @@ group by 1,2,3 ``` #### star ([source](macros/sql/star.sql)) -This macro generates a comma-separated list of all fields that exist in the `from` relation, excluding any fields listed in the `except` argument. The construction is identical to `select * from {{ref('my_model')}}`, replacing star (`*`) with the star macro. This macro also has an optional `relation_alias` argument that will prefix all generated fields with an alias (`relation_alias`.`field_name`). +This macro generates a comma-separated list of all fields that exist in the `from` relation, excluding any fields listed in the `except` argument. The construction is identical to `select * from {{ref('my_model')}}`, replacing star (`*`) with the star macro. This macro also has an optional `relation_alias` argument that will prefix all generated fields with an alias (`relation_alias`.`field_name`). The macro also has optional `prefix` and `suffix` arguments. When one or both are provided, they will be concatenated onto each field's alias in the output (`prefix` ~ `field_name` ~ `suffix`). NB: This prevents the output from being used in any context other than a select statement. From e85023d64e8402fd968586247406b4b7c184ab6a Mon Sep 17 00:00:00 2001 From: Judah Rand <17158624+judahrand@users.noreply.github.com> Date: Sun, 6 Mar 2022 18:03:41 +0000 Subject: [PATCH 7/8] Add entry to CHANGELOG --- CHANGELOG.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f346c55b..c9046d7f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +# dbt-utils v0.8.3 +## New features +- A macro for deduplicating data ([#335](https://github.com/dbt-labs/dbt-utils/issues/335), [#512](https://github.com/dbt-labs/dbt-utils/pull/512)) + # dbt-utils v0.8.2 ## Fixes - Fix union_relations error from [#473](https://github.com/dbt-labs/dbt-utils/pull/473) when no include/exclude parameters are provided ([#505](https://github.com/dbt-labs/dbt-utils/issues/505), [#509](https://github.com/dbt-labs/dbt-utils/pull/509)) @@ -32,7 +36,7 @@ # dbt-utils v0.8.0 ## 🚨 Breaking changes -- dbt ONE POINT OH is here! This version of dbt-utils requires _any_ version (minor and patch) of v1, which means far less need for compatibility releases in the future. +- dbt ONE POINT OH is here! This version of dbt-utils requires _any_ version (minor and patch) of v1, which means far less need for compatibility releases in the future. - The partition column in the `mutually_exclusive_ranges` test is now always called `partition_by_col`. This enables compatibility with `--store-failures` when multiple columns are concatenated together. If you have models built on top of the failures table, update them to reflect the new column name. ([#423](https://github.com/dbt-labs/dbt-utils/issues/423), [#430](https://github.com/dbt-labs/dbt-utils/pull/430)) ## Contributors: From ecaa396cfa96aef62fb52fb6855997bb73f829db Mon Sep 17 00:00:00 2001 From: Judah Rand <17158624+judahrand@users.noreply.github.com> Date: Mon, 7 Mar 2022 08:57:14 +0000 Subject: [PATCH 8/8] Implement review --- README.md | 10 ++-- .../{data_dedupe.csv => data_deduplicate.csv} | 0 ...cted.csv => data_deduplicate_expected.csv} | 0 integration_tests/models/sql/schema.yml | 2 +- integration_tests/models/sql/test_dedupe.sql | 7 --- .../models/sql/test_deduplicate.sql | 7 +++ macros/sql/dedupe.sql | 41 ----------------- macros/sql/deduplicate.sql | 46 +++++++++++++++++++ 8 files changed, 59 insertions(+), 54 deletions(-) rename integration_tests/data/sql/{data_dedupe.csv => data_deduplicate.csv} (100%) rename integration_tests/data/sql/{data_dedupe_expected.csv => data_deduplicate_expected.csv} (100%) delete mode 100644 integration_tests/models/sql/test_dedupe.sql create mode 100644 integration_tests/models/sql/test_deduplicate.sql delete mode 100644 macros/sql/dedupe.sql create mode 100644 macros/sql/deduplicate.sql diff --git a/README.md b/README.md index 10e615ef..adf0bbc3 100644 --- a/README.md +++ b/README.md @@ -706,15 +706,15 @@ This macro returns the sql required to build a date spine. The spine will includ }} ``` -#### dedupe ([source](macros/sql/dedupe.sql)) -This macro returns the sql required to remove deduplicate rows from a model or source. +#### deduplicate ([source](macros/sql/deduplicate.sql)) +This macro returns the sql required to remove duplicate rows from a model or source. **Usage:** ``` -{{ dbt_utils.dedupe( - source('my_source', 'my_table'), - "user_id, cast(timestamp as day)", +{{ dbt_utils.deduplicate( + relation=source('my_source', 'my_table'), + group_by="user_id, cast(timestamp as day)", order_by="timestamp desc" ) }} diff --git a/integration_tests/data/sql/data_dedupe.csv b/integration_tests/data/sql/data_deduplicate.csv similarity index 100% rename from integration_tests/data/sql/data_dedupe.csv rename to integration_tests/data/sql/data_deduplicate.csv diff --git a/integration_tests/data/sql/data_dedupe_expected.csv b/integration_tests/data/sql/data_deduplicate_expected.csv similarity index 100% rename from integration_tests/data/sql/data_dedupe_expected.csv rename to integration_tests/data/sql/data_deduplicate_expected.csv diff --git a/integration_tests/models/sql/schema.yml b/integration_tests/models/sql/schema.yml index 6fbe4866..ad990260 100644 --- a/integration_tests/models/sql/schema.yml +++ b/integration_tests/models/sql/schema.yml @@ -146,4 +146,4 @@ models: - name: test_dedupe tests: - dbt_utils.equality: - compare_model: ref('data_dedupe_expected') + compare_model: ref('data_deduplicate_expected') diff --git a/integration_tests/models/sql/test_dedupe.sql b/integration_tests/models/sql/test_dedupe.sql deleted file mode 100644 index 1d078002..00000000 --- a/integration_tests/models/sql/test_dedupe.sql +++ /dev/null @@ -1,7 +0,0 @@ -WITH deduped as ( - - {{ dbt_utils.dedupe(ref('data_dedupe'), group_by='user_id', order_by='version desc') | indent }} - -) - -select * from deduped diff --git a/integration_tests/models/sql/test_deduplicate.sql b/integration_tests/models/sql/test_deduplicate.sql new file mode 100644 index 00000000..7df79261 --- /dev/null +++ b/integration_tests/models/sql/test_deduplicate.sql @@ -0,0 +1,7 @@ +with deduped as ( + + {{ dbt_utils.deduplicate(ref('data_deduplicate'), group_by='user_id', order_by='version desc') | indent }} + +) + +select * from deduped diff --git a/macros/sql/dedupe.sql b/macros/sql/dedupe.sql deleted file mode 100644 index 0d875db1..00000000 --- a/macros/sql/dedupe.sql +++ /dev/null @@ -1,41 +0,0 @@ -{%- macro dedupe(table, group_by, order_by=none) -%} - {{ return(adapter.dispatch('dedupe', 'dbt_utils')(table, group_by, order_by=order_by)) }} -{% endmacro %} - -{%- macro default__dedupe(table, group_by, order_by=none) -%} - - select - {{ dbt_utils.star(table, relation_alias='deduped') | indent }} - from ( - select - _inner.*, - row_number() over ( - partition by {{ group_by }} - {% if order_by != none -%} - order by {{ order_by }} - {%- endif %} - ) as rn - from {{ table }} as _inner - ) as deduped - where deduped.rn = 1 - -{%- endmacro -%} - -{%- macro bigquery__dedupe(table, group_by, order_by=none) -%} - - select - {{ dbt_utils.star(table, relation_alias='deduped') | indent }} - from ( - select - array_agg ( - original - {% if order_by != none -%} - order by {{ order_by }} - {%- endif %} - limit 1 - )[offset(0)] as deduped - from {{ table }} as original - group by {{ group_by }} - ) - -{%- endmacro -%} diff --git a/macros/sql/deduplicate.sql b/macros/sql/deduplicate.sql new file mode 100644 index 00000000..f5d65534 --- /dev/null +++ b/macros/sql/deduplicate.sql @@ -0,0 +1,46 @@ +{%- macro deduplicate(relation, group_by, order_by=none) -%} + {{ return(adapter.dispatch('deduplicate', 'dbt_utils')(relation, group_by, order_by=order_by)) }} +{% endmacro %} + +{%- macro default__deduplicate(relation, group_by, order_by=none) -%} + + select + {{ dbt_utils.star(relation, relation_alias='deduped') | indent }} + from ( + select + _inner.*, + row_number() over ( + partition by {{ group_by }} + {% if order_by is not none -%} + order by {{ order_by }} + {%- endif %} + ) as rn + from {{ relation }} as _inner + ) as deduped + where deduped.rn = 1 + +{%- endmacro -%} + +{# +-- It is more performant to deduplicate using `array_agg` with a limit +-- clause in BigQuery: +-- https://github.com/dbt-labs/dbt-utils/issues/335#issuecomment-788157572 +#} +{%- macro bigquery__deduplicate(relation, group_by, order_by=none) -%} + + select + {{ dbt_utils.star(relation, relation_alias='deduped') | indent }} + from ( + select + array_agg ( + original + {% if order_by is not none -%} + order by {{ order_by }} + {%- endif %} + limit 1 + )[offset(0)] as deduped + from {{ relation }} as original + group by {{ group_by }} + ) + +{%- endmacro -%}