diff --git a/CHANGELOG.md b/CHANGELOG.md index f346c55b..c9046d7f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +# dbt-utils v0.8.3 +## New features +- A macro for deduplicating data ([#335](https://github.com/dbt-labs/dbt-utils/issues/335), [#512](https://github.com/dbt-labs/dbt-utils/pull/512)) + # dbt-utils v0.8.2 ## Fixes - Fix union_relations error from [#473](https://github.com/dbt-labs/dbt-utils/pull/473) when no include/exclude parameters are provided ([#505](https://github.com/dbt-labs/dbt-utils/issues/505), [#509](https://github.com/dbt-labs/dbt-utils/pull/509)) @@ -32,7 +36,7 @@ # dbt-utils v0.8.0 ## 🚨 Breaking changes -- dbt ONE POINT OH is here! This version of dbt-utils requires _any_ version (minor and patch) of v1, which means far less need for compatibility releases in the future. +- dbt ONE POINT OH is here! This version of dbt-utils requires _any_ version (minor and patch) of v1, which means far less need for compatibility releases in the future. - The partition column in the `mutually_exclusive_ranges` test is now always called `partition_by_col`. This enables compatibility with `--store-failures` when multiple columns are concatenated together. If you have models built on top of the failures table, update them to reflect the new column name. ([#423](https://github.com/dbt-labs/dbt-utils/issues/423), [#430](https://github.com/dbt-labs/dbt-utils/pull/430)) ## Contributors: diff --git a/README.md b/README.md index f707a376..ae922a1f 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,7 @@ For compatibility details between versions of dbt-core and dbt-utils, [see this - [SQL generators](#sql-generators) - [date_spine](#date_spine-source) + - [dedupe](#dedupe-source) - [haversine_distance](#haversine_distance-source) - [group_by](#group_by-source) - [star](#star-source) @@ -706,6 +707,20 @@ This macro returns the sql required to build a date spine. The spine will includ }} ``` +#### deduplicate ([source](macros/sql/deduplicate.sql)) +This macro returns the sql required to remove duplicate rows from a model or source. + +**Usage:** + +``` +{{ dbt_utils.deduplicate( + relation=source('my_source', 'my_table'), + group_by="user_id, cast(timestamp as day)", + order_by="timestamp desc" + ) +}} +``` + #### haversine_distance ([source](macros/sql/haversine_distance.sql)) This macro calculates the [haversine distance](http://daynebatten.com/2015/09/latitude-longitude-distance-sql/) between a pair of x/y coordinates. @@ -748,7 +763,7 @@ group by 1,2,3 ``` #### star ([source](macros/sql/star.sql)) -This macro generates a comma-separated list of all fields that exist in the `from` relation, excluding any fields listed in the `except` argument. The construction is identical to `select * from {{ref('my_model')}}`, replacing star (`*`) with the star macro. This macro also has an optional `relation_alias` argument that will prefix all generated fields with an alias (`relation_alias`.`field_name`). +This macro generates a comma-separated list of all fields that exist in the `from` relation, excluding any fields listed in the `except` argument. The construction is identical to `select * from {{ref('my_model')}}`, replacing star (`*`) with the star macro. This macro also has an optional `relation_alias` argument that will prefix all generated fields with an alias (`relation_alias`.`field_name`). The macro also has optional `prefix` and `suffix` arguments. When one or both are provided, they will be concatenated onto each field's alias in the output (`prefix` ~ `field_name` ~ `suffix`). NB: This prevents the output from being used in any context other than a select statement. diff --git a/integration_tests/data/sql/data_deduplicate.csv b/integration_tests/data/sql/data_deduplicate.csv new file mode 100644 index 00000000..c3ae0c4d --- /dev/null +++ b/integration_tests/data/sql/data_deduplicate.csv @@ -0,0 +1,3 @@ +user_id,event,version +1,play,1 +1,play,2 diff --git a/integration_tests/data/sql/data_deduplicate_expected.csv b/integration_tests/data/sql/data_deduplicate_expected.csv new file mode 100644 index 00000000..de5e204d --- /dev/null +++ b/integration_tests/data/sql/data_deduplicate_expected.csv @@ -0,0 +1,2 @@ +user_id,event,version +1,play,2 diff --git a/integration_tests/models/sql/schema.yml b/integration_tests/models/sql/schema.yml index 616dd386..ad990260 100644 --- a/integration_tests/models/sql/schema.yml +++ b/integration_tests/models/sql/schema.yml @@ -85,7 +85,7 @@ models: tests: - dbt_utils.equality: compare_model: ref('data_pivot_expected') - + - name: test_pivot_apostrophe tests: - dbt_utils.equality: @@ -137,8 +137,13 @@ models: tests: - dbt_utils.equality: compare_model: ref('data_union_expected') - + - name: test_get_relations_by_pattern tests: - dbt_utils.equality: compare_model: ref('data_union_events_expected') + + - name: test_dedupe + tests: + - dbt_utils.equality: + compare_model: ref('data_deduplicate_expected') diff --git a/integration_tests/models/sql/test_deduplicate.sql b/integration_tests/models/sql/test_deduplicate.sql new file mode 100644 index 00000000..7df79261 --- /dev/null +++ b/integration_tests/models/sql/test_deduplicate.sql @@ -0,0 +1,7 @@ +with deduped as ( + + {{ dbt_utils.deduplicate(ref('data_deduplicate'), group_by='user_id', order_by='version desc') | indent }} + +) + +select * from deduped diff --git a/macros/sql/deduplicate.sql b/macros/sql/deduplicate.sql new file mode 100644 index 00000000..f5d65534 --- /dev/null +++ b/macros/sql/deduplicate.sql @@ -0,0 +1,46 @@ +{%- macro deduplicate(relation, group_by, order_by=none) -%} + {{ return(adapter.dispatch('deduplicate', 'dbt_utils')(relation, group_by, order_by=order_by)) }} +{% endmacro %} + +{%- macro default__deduplicate(relation, group_by, order_by=none) -%} + + select + {{ dbt_utils.star(relation, relation_alias='deduped') | indent }} + from ( + select + _inner.*, + row_number() over ( + partition by {{ group_by }} + {% if order_by is not none -%} + order by {{ order_by }} + {%- endif %} + ) as rn + from {{ relation }} as _inner + ) as deduped + where deduped.rn = 1 + +{%- endmacro -%} + +{# +-- It is more performant to deduplicate using `array_agg` with a limit +-- clause in BigQuery: +-- https://github.com/dbt-labs/dbt-utils/issues/335#issuecomment-788157572 +#} +{%- macro bigquery__deduplicate(relation, group_by, order_by=none) -%} + + select + {{ dbt_utils.star(relation, relation_alias='deduped') | indent }} + from ( + select + array_agg ( + original + {% if order_by is not none -%} + order by {{ order_by }} + {%- endif %} + limit 1 + )[offset(0)] as deduped + from {{ relation }} as original + group by {{ group_by }} + ) + +{%- endmacro -%}