From a7f4f51fe1d9883150f01f623e83f1ab197efcbb Mon Sep 17 00:00:00 2001
From: Joel Labes <joel.labes@dbtlabs.com>
Date: Wed, 23 Feb 2022 15:53:49 +1300
Subject: [PATCH 1/8] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index bab3a503..f22c3809 100644
--- a/README.md
+++ b/README.md
@@ -67,7 +67,7 @@ For compatibility details between versions of dbt-core and dbt-utils, [see this
 [Materializations](#materializations):
 - [insert_by_period](#insert_by_period-source)
 
----
+----
 ### Schema Tests
 #### equal_rowcount ([source](macros/schema_tests/equal_rowcount.sql))
 This schema test asserts the that two relations have the same number of rows.

From 168396fb2babd889ed2acba6fb8242563fd4bf27 Mon Sep 17 00:00:00 2001
From: Joel Labes <joel.labes@dbtlabs.com>
Date: Mon, 28 Feb 2022 15:51:08 +1300
Subject: [PATCH 2/8] Mutually excl range examples in disclosure triangle

---
 README.md | 99 +++++++++++++++++++++++++++++--------------------------
 1 file changed, 52 insertions(+), 47 deletions(-)

diff --git a/README.md b/README.md
index f22c3809..709640b3 100644
--- a/README.md
+++ b/README.md
@@ -377,53 +377,58 @@ models:
         partition_by: customer_id
         gaps: allowed
 ```
-
-**Understanding the `gaps` argument:**
-Here are a number of examples for each allowed `gaps` argument.
-* `gaps: not_allowed`: The upper bound of one record must be the lower bound of
-the next record.
-
-| lower_bound | upper_bound |
-|-------------|-------------|
-| 0           | 1           |
-| 1           | 2           |
-| 2           | 3           |
-
-* `gaps: allowed` (default): There may be a gap between the upper bound of one
-record and the lower bound of the next record.
-
-| lower_bound | upper_bound |
-|-------------|-------------|
-| 0           | 1           |
-| 2           | 3           |
-| 3           | 4           |
-
-* `gaps: required`: There must be a gap between the upper bound of one record and
-the lower bound of the next record (common for date ranges).
-
-| lower_bound | upper_bound |
-|-------------|-------------|
-| 0           | 1           |
-| 2           | 3           |
-| 4           | 5           |
-
-**Understanding the `zero_length_range_allowed` argument:**
-Here are a number of examples for each allowed `zero_length_range_allowed` argument.
-* `zero_length_range_allowed: false`: (default) The upper bound of each record must be greater than its lower bound.
-
-| lower_bound | upper_bound |
-|-------------|-------------|
-| 0           | 1           |
-| 1           | 2           |
-| 2           | 3           |
-
-* `zero_length_range_allowed: true`: The upper bound of each record can be greater than or equal to its lower bound.
-
-| lower_bound | upper_bound |
-|-------------|-------------|
-| 0           | 1           |
-| 2           | 2           |
-| 3           | 4           |
+<details>
+<summary>Additional `gaps` and `zero_length_range_allowed` examples</summary>
+
+  **Understanding the `gaps` argument:**
+
+  Here are a number of examples for each allowed `gaps` argument.
+  * `gaps: not_allowed`: The upper bound of one record must be the lower bound of
+  the next record.
+
+  | lower_bound | upper_bound |
+  |-------------|-------------|
+  | 0           | 1           |
+  | 1           | 2           |
+  | 2           | 3           |
+
+  * `gaps: allowed` (default): There may be a gap between the upper bound of one
+  record and the lower bound of the next record.
+
+  | lower_bound | upper_bound |
+  |-------------|-------------|
+  | 0           | 1           |
+  | 2           | 3           |
+  | 3           | 4           |
+
+  * `gaps: required`: There must be a gap between the upper bound of one record and
+  the lower bound of the next record (common for date ranges).
+
+  | lower_bound | upper_bound |
+  |-------------|-------------|
+  | 0           | 1           |
+  | 2           | 3           |
+  | 4           | 5           |
+
+  **Understanding the `zero_length_range_allowed` argument:**
+  Here are a number of examples for each allowed `zero_length_range_allowed` argument.
+  * `zero_length_range_allowed: false`: (default) The upper bound of each record must be greater than its lower bound.
+
+  | lower_bound | upper_bound |
+  |-------------|-------------|
+  | 0           | 1           |
+  | 1           | 2           |
+  | 2           | 3           |
+
+  * `zero_length_range_allowed: true`: The upper bound of each record can be greater than or equal to its lower bound.
+
+  | lower_bound | upper_bound |
+  |-------------|-------------|
+  | 0           | 1           |
+  | 2           | 2           |
+  | 3           | 4           |
+
+</details>
 
 #### sequential_values ([source](macros/schema_tests/sequential_values.sql))
 This test confirms that a column contains sequential values. It can be used

From 5717b109d358844e0f085565b93e74cbe469ecb0 Mon Sep 17 00:00:00 2001
From: Joel Labes <joel.labes@dbtlabs.com>
Date: Thu, 3 Mar 2022 14:24:19 +1300
Subject: [PATCH 3/8] Fix union_relations error when no include/exclude
 provided

* Fix union_relations error when no include/exclude provided (#509)

* Update CHANGELOG.md
---
 CHANGELOG.md         | 4 ++++
 macros/sql/union.sql | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 67b690e7..f346c55b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,7 @@
+# dbt-utils v0.8.2
+## Fixes
+- Fix union_relations error from [#473](https://github.com/dbt-labs/dbt-utils/pull/473) when no include/exclude parameters are provided ([#505](https://github.com/dbt-labs/dbt-utils/issues/505), [#509](https://github.com/dbt-labs/dbt-utils/pull/509))
+
 # dbt-utils v0.8.1
 
 ## New features
diff --git a/macros/sql/union.sql b/macros/sql/union.sql
index 009a765a..a7bf1d95 100644
--- a/macros/sql/union.sql
+++ b/macros/sql/union.sql
@@ -61,7 +61,7 @@
 
     {%- set ordered_column_names = column_superset.keys() -%}
 
-    {%- if not column_superset.keys() -%}
+    {% if (include | length > 0 or exclude | length > 0) and not column_superset.keys() %}
         {%- set relations_string -%}
             {%- for relation in relations -%}
                 {{ relation.name }}

From ea38b6b3e66c6050f642c2d5228ee68bd19120e2 Mon Sep 17 00:00:00 2001
From: Judah Rand <17158624+judahrand@users.noreply.github.com>
Date: Sun, 6 Mar 2022 15:55:16 +0000
Subject: [PATCH 4/8] Add dedupe macro

---
 macros/sql/dedupe.sql | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 macros/sql/dedupe.sql

diff --git a/macros/sql/dedupe.sql b/macros/sql/dedupe.sql
new file mode 100644
index 00000000..0d875db1
--- /dev/null
+++ b/macros/sql/dedupe.sql
@@ -0,0 +1,41 @@
+{%- macro dedupe(table, group_by, order_by=none) -%}
+    {{ return(adapter.dispatch('dedupe', 'dbt_utils')(table, group_by, order_by=order_by)) }}
+{% endmacro %}
+
+{%- macro default__dedupe(table, group_by, order_by=none) -%}
+
+    select
+        {{ dbt_utils.star(table, relation_alias='deduped') | indent }}
+    from (
+        select
+            _inner.*,
+            row_number() over (
+                partition by {{ group_by }}
+                {% if order_by != none -%}
+                order by {{ order_by }}
+                {%- endif %}
+            ) as rn
+        from {{ table }} as _inner
+    ) as deduped
+    where deduped.rn = 1
+
+{%- endmacro -%}
+
+{%- macro bigquery__dedupe(table, group_by, order_by=none) -%}
+
+    select
+        {{ dbt_utils.star(table, relation_alias='deduped') | indent }}
+    from (
+        select
+            array_agg (
+                original
+                {% if order_by != none -%}
+                order by {{ order_by }}
+                {%- endif %}
+                limit 1
+            )[offset(0)] as deduped
+        from {{ table }} as original
+        group by {{ group_by }}
+    )
+
+{%- endmacro -%}

From 821c1d0f69c38c61aa14111416076237f9f0fc31 Mon Sep 17 00:00:00 2001
From: Judah Rand <17158624+judahrand@users.noreply.github.com>
Date: Sun, 6 Mar 2022 16:07:44 +0000
Subject: [PATCH 5/8] Add test for dedupe macro

---
 integration_tests/data/sql/data_dedupe.csv          | 3 +++
 integration_tests/data/sql/data_dedupe_expected.csv | 2 ++
 integration_tests/models/sql/schema.yml             | 9 +++++++--
 integration_tests/models/sql/test_dedupe.sql        | 7 +++++++
 4 files changed, 19 insertions(+), 2 deletions(-)
 create mode 100644 integration_tests/data/sql/data_dedupe.csv
 create mode 100644 integration_tests/data/sql/data_dedupe_expected.csv
 create mode 100644 integration_tests/models/sql/test_dedupe.sql

diff --git a/integration_tests/data/sql/data_dedupe.csv b/integration_tests/data/sql/data_dedupe.csv
new file mode 100644
index 00000000..c3ae0c4d
--- /dev/null
+++ b/integration_tests/data/sql/data_dedupe.csv
@@ -0,0 +1,3 @@
+user_id,event,version
+1,play,1
+1,play,2
diff --git a/integration_tests/data/sql/data_dedupe_expected.csv b/integration_tests/data/sql/data_dedupe_expected.csv
new file mode 100644
index 00000000..de5e204d
--- /dev/null
+++ b/integration_tests/data/sql/data_dedupe_expected.csv
@@ -0,0 +1,2 @@
+user_id,event,version
+1,play,2
diff --git a/integration_tests/models/sql/schema.yml b/integration_tests/models/sql/schema.yml
index 616dd386..6fbe4866 100644
--- a/integration_tests/models/sql/schema.yml
+++ b/integration_tests/models/sql/schema.yml
@@ -85,7 +85,7 @@ models:
     tests:
       - dbt_utils.equality:
           compare_model: ref('data_pivot_expected')
-  
+
   - name: test_pivot_apostrophe
     tests:
       - dbt_utils.equality:
@@ -137,8 +137,13 @@ models:
     tests:
       - dbt_utils.equality:
           compare_model: ref('data_union_expected')
-          
+
   - name: test_get_relations_by_pattern
     tests:
       - dbt_utils.equality:
           compare_model: ref('data_union_events_expected')
+
+  - name: test_dedupe
+    tests:
+      - dbt_utils.equality:
+          compare_model: ref('data_dedupe_expected')
diff --git a/integration_tests/models/sql/test_dedupe.sql b/integration_tests/models/sql/test_dedupe.sql
new file mode 100644
index 00000000..1d078002
--- /dev/null
+++ b/integration_tests/models/sql/test_dedupe.sql
@@ -0,0 +1,7 @@
+WITH deduped as (
+
+    {{ dbt_utils.dedupe(ref('data_dedupe'), group_by='user_id', order_by='version desc') | indent }}
+
+)
+
+select * from deduped

From f88d49b484262dd3e9baaceca3e0cb5b76c1333a Mon Sep 17 00:00:00 2001
From: Judah Rand <17158624+judahrand@users.noreply.github.com>
Date: Sun, 6 Mar 2022 17:58:41 +0000
Subject: [PATCH 6/8] Add documentation to README

---
 README.md | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 709640b3..10e615ef 100644
--- a/README.md
+++ b/README.md
@@ -36,6 +36,7 @@ For compatibility details between versions of dbt-core and dbt-utils, [see this
 
 - [SQL generators](#sql-generators)
     - [date_spine](#date_spine-source)
+    - [dedupe](#dedupe-source)
     - [haversine_distance](#haversine_distance-source)
     - [group_by](#group_by-source)
     - [star](#star-source)
@@ -705,6 +706,20 @@ This macro returns the sql required to build a date spine. The spine will includ
 }}
 ```
 
+#### dedupe ([source](macros/sql/dedupe.sql))
+This macro returns the sql required to remove deduplicate rows from a model or source.
+
+**Usage:**
+
+```
+{{ dbt_utils.dedupe(
+    source('my_source', 'my_table'),
+    "user_id, cast(timestamp as day)",
+    order_by="timestamp desc"
+   )
+}}
+```
+
 #### haversine_distance ([source](macros/sql/haversine_distance.sql))
 This macro calculates the [haversine distance](http://daynebatten.com/2015/09/latitude-longitude-distance-sql/) between a pair of x/y coordinates.
 
@@ -747,7 +762,7 @@ group by 1,2,3
 ```
 
 #### star ([source](macros/sql/star.sql))
-This macro generates a comma-separated list of all fields that exist in the `from` relation, excluding any fields listed in the `except` argument. The construction is identical to `select * from {{ref('my_model')}}`, replacing star (`*`) with the star macro. This macro also has an optional `relation_alias` argument that will prefix all generated fields with an alias (`relation_alias`.`field_name`). 
+This macro generates a comma-separated list of all fields that exist in the `from` relation, excluding any fields listed in the `except` argument. The construction is identical to `select * from {{ref('my_model')}}`, replacing star (`*`) with the star macro. This macro also has an optional `relation_alias` argument that will prefix all generated fields with an alias (`relation_alias`.`field_name`).
 
 The macro also has optional `prefix` and `suffix` arguments. When one or both are provided, they will be concatenated onto each field's alias in the output (`prefix` ~ `field_name` ~ `suffix`). NB: This prevents the output from being used in any context other than a select statement.
 

From e85023d64e8402fd968586247406b4b7c184ab6a Mon Sep 17 00:00:00 2001
From: Judah Rand <17158624+judahrand@users.noreply.github.com>
Date: Sun, 6 Mar 2022 18:03:41 +0000
Subject: [PATCH 7/8] Add entry to CHANGELOG

---
 CHANGELOG.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f346c55b..c9046d7f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,7 @@
+# dbt-utils v0.8.3
+## New features
+- A macro for deduplicating data ([#335](https://github.com/dbt-labs/dbt-utils/issues/335), [#512](https://github.com/dbt-labs/dbt-utils/pull/512))
+
 # dbt-utils v0.8.2
 ## Fixes
 - Fix union_relations error from [#473](https://github.com/dbt-labs/dbt-utils/pull/473) when no include/exclude parameters are provided ([#505](https://github.com/dbt-labs/dbt-utils/issues/505), [#509](https://github.com/dbt-labs/dbt-utils/pull/509))
@@ -32,7 +36,7 @@
 
 # dbt-utils v0.8.0
 ## 🚨 Breaking changes
-- dbt ONE POINT OH is here! This version of dbt-utils requires _any_ version (minor and patch) of v1, which means far less need for compatibility releases in the future. 
+- dbt ONE POINT OH is here! This version of dbt-utils requires _any_ version (minor and patch) of v1, which means far less need for compatibility releases in the future.
 - The partition column in the `mutually_exclusive_ranges` test is now always called `partition_by_col`. This enables compatibility with `--store-failures` when multiple columns are concatenated together. If you have models built on top of the failures table, update them to reflect the new column name. ([#423](https://github.com/dbt-labs/dbt-utils/issues/423), [#430](https://github.com/dbt-labs/dbt-utils/pull/430))
 
 ## Contributors:

From ecaa396cfa96aef62fb52fb6855997bb73f829db Mon Sep 17 00:00:00 2001
From: Judah Rand <17158624+judahrand@users.noreply.github.com>
Date: Mon, 7 Mar 2022 08:57:14 +0000
Subject: [PATCH 8/8] Implement review

---
 README.md                                     | 10 ++--
 .../{data_dedupe.csv => data_deduplicate.csv} |  0
 ...cted.csv => data_deduplicate_expected.csv} |  0
 integration_tests/models/sql/schema.yml       |  2 +-
 integration_tests/models/sql/test_dedupe.sql  |  7 ---
 .../models/sql/test_deduplicate.sql           |  7 +++
 macros/sql/dedupe.sql                         | 41 -----------------
 macros/sql/deduplicate.sql                    | 46 +++++++++++++++++++
 8 files changed, 59 insertions(+), 54 deletions(-)
 rename integration_tests/data/sql/{data_dedupe.csv => data_deduplicate.csv} (100%)
 rename integration_tests/data/sql/{data_dedupe_expected.csv => data_deduplicate_expected.csv} (100%)
 delete mode 100644 integration_tests/models/sql/test_dedupe.sql
 create mode 100644 integration_tests/models/sql/test_deduplicate.sql
 delete mode 100644 macros/sql/dedupe.sql
 create mode 100644 macros/sql/deduplicate.sql

diff --git a/README.md b/README.md
index 10e615ef..adf0bbc3 100644
--- a/README.md
+++ b/README.md
@@ -706,15 +706,15 @@ This macro returns the sql required to build a date spine. The spine will includ
 }}
 ```
 
-#### dedupe ([source](macros/sql/dedupe.sql))
-This macro returns the sql required to remove deduplicate rows from a model or source.
+#### deduplicate ([source](macros/sql/deduplicate.sql))
+This macro returns the sql required to remove duplicate rows from a model or source.
 
 **Usage:**
 
 ```
-{{ dbt_utils.dedupe(
-    source('my_source', 'my_table'),
-    "user_id, cast(timestamp as day)",
+{{ dbt_utils.deduplicate(
+    relation=source('my_source', 'my_table'),
+    group_by="user_id, cast(timestamp as day)",
     order_by="timestamp desc"
    )
 }}
diff --git a/integration_tests/data/sql/data_dedupe.csv b/integration_tests/data/sql/data_deduplicate.csv
similarity index 100%
rename from integration_tests/data/sql/data_dedupe.csv
rename to integration_tests/data/sql/data_deduplicate.csv
diff --git a/integration_tests/data/sql/data_dedupe_expected.csv b/integration_tests/data/sql/data_deduplicate_expected.csv
similarity index 100%
rename from integration_tests/data/sql/data_dedupe_expected.csv
rename to integration_tests/data/sql/data_deduplicate_expected.csv
diff --git a/integration_tests/models/sql/schema.yml b/integration_tests/models/sql/schema.yml
index 6fbe4866..ad990260 100644
--- a/integration_tests/models/sql/schema.yml
+++ b/integration_tests/models/sql/schema.yml
@@ -146,4 +146,4 @@ models:
   - name: test_dedupe
     tests:
       - dbt_utils.equality:
-          compare_model: ref('data_dedupe_expected')
+          compare_model: ref('data_deduplicate_expected')
diff --git a/integration_tests/models/sql/test_dedupe.sql b/integration_tests/models/sql/test_dedupe.sql
deleted file mode 100644
index 1d078002..00000000
--- a/integration_tests/models/sql/test_dedupe.sql
+++ /dev/null
@@ -1,7 +0,0 @@
-WITH deduped as (
-
-    {{ dbt_utils.dedupe(ref('data_dedupe'), group_by='user_id', order_by='version desc') | indent }}
-
-)
-
-select * from deduped
diff --git a/integration_tests/models/sql/test_deduplicate.sql b/integration_tests/models/sql/test_deduplicate.sql
new file mode 100644
index 00000000..7df79261
--- /dev/null
+++ b/integration_tests/models/sql/test_deduplicate.sql
@@ -0,0 +1,7 @@
+with deduped as (
+
+    {{ dbt_utils.deduplicate(ref('data_deduplicate'), group_by='user_id', order_by='version desc') | indent }}
+
+)
+
+select * from deduped
diff --git a/macros/sql/dedupe.sql b/macros/sql/dedupe.sql
deleted file mode 100644
index 0d875db1..00000000
--- a/macros/sql/dedupe.sql
+++ /dev/null
@@ -1,41 +0,0 @@
-{%- macro dedupe(table, group_by, order_by=none) -%}
-    {{ return(adapter.dispatch('dedupe', 'dbt_utils')(table, group_by, order_by=order_by)) }}
-{% endmacro %}
-
-{%- macro default__dedupe(table, group_by, order_by=none) -%}
-
-    select
-        {{ dbt_utils.star(table, relation_alias='deduped') | indent }}
-    from (
-        select
-            _inner.*,
-            row_number() over (
-                partition by {{ group_by }}
-                {% if order_by != none -%}
-                order by {{ order_by }}
-                {%- endif %}
-            ) as rn
-        from {{ table }} as _inner
-    ) as deduped
-    where deduped.rn = 1
-
-{%- endmacro -%}
-
-{%- macro bigquery__dedupe(table, group_by, order_by=none) -%}
-
-    select
-        {{ dbt_utils.star(table, relation_alias='deduped') | indent }}
-    from (
-        select
-            array_agg (
-                original
-                {% if order_by != none -%}
-                order by {{ order_by }}
-                {%- endif %}
-                limit 1
-            )[offset(0)] as deduped
-        from {{ table }} as original
-        group by {{ group_by }}
-    )
-
-{%- endmacro -%}
diff --git a/macros/sql/deduplicate.sql b/macros/sql/deduplicate.sql
new file mode 100644
index 00000000..f5d65534
--- /dev/null
+++ b/macros/sql/deduplicate.sql
@@ -0,0 +1,46 @@
+{%- macro deduplicate(relation, group_by, order_by=none) -%}
+    {{ return(adapter.dispatch('deduplicate', 'dbt_utils')(relation, group_by, order_by=order_by)) }}
+{% endmacro %}
+
+{%- macro default__deduplicate(relation, group_by, order_by=none) -%}
+
+    select
+        {{ dbt_utils.star(relation, relation_alias='deduped') | indent }}
+    from (
+        select
+            _inner.*,
+            row_number() over (
+                partition by {{ group_by }}
+                {% if order_by is not none -%}
+                order by {{ order_by }}
+                {%- endif %}
+            ) as rn
+        from {{ relation }} as _inner
+    ) as deduped
+    where deduped.rn = 1
+
+{%- endmacro -%}
+
+{#
+--  It is more performant to deduplicate using `array_agg` with a limit
+--  clause in BigQuery:
+--  https://github.com/dbt-labs/dbt-utils/issues/335#issuecomment-788157572
+#}
+{%- macro bigquery__deduplicate(relation, group_by, order_by=none) -%}
+
+    select
+        {{ dbt_utils.star(relation, relation_alias='deduped') | indent }}
+    from (
+        select
+            array_agg (
+                original
+                {% if order_by is not none -%}
+                order by {{ order_by }}
+                {%- endif %}
+                limit 1
+            )[offset(0)] as deduped
+        from {{ relation }} as original
+        group by {{ group_by }}
+    )
+
+{%- endmacro -%}