From 1a517d222b0fa9dd2baf314edf60794aa13ef426 Mon Sep 17 00:00:00 2001 From: Grace Goheen <53586774+graciegoheen@users.noreply.github.com> Date: Wed, 6 Apr 2022 17:39:28 -0400 Subject: [PATCH] Feature/add listagg macro (#530) * Update README.md * Mutually excl range examples in disclosure triangle * Fix union_relations error when no include/exclude provided * Fix union_relations error when no include/exclude provided (#509) * Update CHANGELOG.md * Add to_condition to relationships where * very minor nit - update "an new" to "a new" (#519) * add quoting to split_part (#528) * add quoting to split_part * update docs for split_part * typo * corrected readme syntax * revert and update to just documentation * add new line * Update README.md * Update README.md * Update README.md Co-authored-by: Joel Labes * add macro to get columns (#516) * add macro to get columns * star macro should use get_columns * add adapter. * swap adapter for dbt_utils Co-authored-by: Joel Labes * update documentation * add output_lower arg * update name to get_filtered_columns_in_relation from get_columns * add tests * forgot args * too much whitespace removal ----------- Actual: ----------- --->"field_3"as "test_field_3"<--- ----------- Expected: ----------- --->"field_3" as "test_field_3"<--- * didnt mean to move a file that i did not create. moving things back. * remove lowercase logic * limit_zero Co-authored-by: Joel Labes * Add listagg macro and integration test * remove type in listagg macro * updated integration test * Add redshift to listagg macro * remove redshift listagg * explicitly named group by column * updated default values * Updated example to use correct double vs. single quotes * whitespace control * Added redshift specific macro * Remove documentation * Update integration test so less likely to accidentally work Co-authored-by: Joel Labes * default everything but measure to none * added limit functionality for other dbs * syntax bug for postgres * update redshift macro * fixed block def control * Fixed bug in redshift * Bug fix redshift * remove unused group_by arg * Added additional test without order by col * updated to regex replace * typo * added more integration_tests * attempt to make redshift less complicated * typo * update redshift * replace to substr * More explicit versions with added complexity * handle special characters Co-authored-by: Joel Labes Co-authored-by: Jamie Rosenberg Co-authored-by: Pat Kearns --- CHANGELOG.md | 1 + README.md | 61 ++++++++-- .../data/cross_db/data_listagg.csv | 10 ++ .../data/cross_db/data_listagg_output.csv | 10 ++ .../sql/data_filtered_columns_in_relation.csv | 4 + ..._filtered_columns_in_relation_expected.csv | 2 + .../macros/assert_equal_values.sql | 32 ++++++ .../models/cross_db_utils/schema.yml | 6 + .../models/cross_db_utils/test_listagg.sql | 69 ++++++++++++ integration_tests/models/sql/schema.yml | 10 ++ .../test_get_filtered_columns_in_relation.sql | 16 +++ .../models/sql/test_star_uppercase.sql | 13 +++ macros/cross_db_utils/listagg.sql | 104 ++++++++++++++++++ .../sql/get_filtered_columns_in_relation.sql | 25 +++++ macros/sql/star.sql | 15 +-- 15 files changed, 358 insertions(+), 20 deletions(-) create mode 100644 integration_tests/data/cross_db/data_listagg.csv create mode 100644 integration_tests/data/cross_db/data_listagg_output.csv create mode 100644 integration_tests/data/sql/data_filtered_columns_in_relation.csv create mode 100644 integration_tests/data/sql/data_filtered_columns_in_relation_expected.csv create mode 100644 integration_tests/macros/assert_equal_values.sql create mode 100644 integration_tests/models/cross_db_utils/test_listagg.sql create mode 100644 integration_tests/models/sql/test_get_filtered_columns_in_relation.sql create mode 100644 integration_tests/models/sql/test_star_uppercase.sql create mode 100644 macros/cross_db_utils/listagg.sql create mode 100644 macros/sql/get_filtered_columns_in_relation.sql diff --git a/CHANGELOG.md b/CHANGELOG.md index da3399cb..8700864b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,4 @@ + # dbt-utils v0.8.3 ## New features - A macro for deduplicating data ([#335](https://github.com/dbt-labs/dbt-utils/issues/335), [#512](https://github.com/dbt-labs/dbt-utils/pull/512)) diff --git a/README.md b/README.md index 581f63ba..58ba9e23 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,7 @@ For compatibility details between versions of dbt-core and dbt-utils, [see this - [Introspective macros](#introspective-macros): - [get_column_values](#get_column_values-source) + - [get_filtered_columns_in_relation](#get_filtered_columns_in_relation-source) - [get_relations_by_pattern](#get_relations_by_pattern-source) - [get_relations_by_prefix](#get_relations_by_prefix-source) - [get_query_results_as_dict](#get_query_results_as_dict-source) @@ -59,6 +60,7 @@ For compatibility details between versions of dbt-core and dbt-utils, [see this - [split_part](#split_part-source) - [last_day](#last_day-source) - [width_bucket](#width_bucket-source) + - [listagg](#listagg) - [Jinja Helpers](#jinja-helpers) - [pretty_time](#pretty_time-source) @@ -69,11 +71,11 @@ For compatibility details between versions of dbt-core and dbt-utils, [see this - [insert_by_period](#insert_by_period-source) ---- -======= ### Generic Tests #### equal_rowcount ([source](macros/generic_tests/equal_rowcount.sql)) Asserts that two relations have the same number of rows. + **Usage:** ```yaml version: 2 @@ -387,7 +389,6 @@ models: ```
Additional `gaps` and `zero_length_range_allowed` examples - **Understanding the `gaps` argument:** Here are a number of examples for each allowed `gaps` argument. @@ -435,7 +436,6 @@ models: | 0 | 1 | | 2 | 2 | | 3 | 4 | -
#### sequential_values ([source](macros/generic_tests/sequential_values.sql)) @@ -551,7 +551,7 @@ These macros run a query and return the results of the query as objects. They ar #### get_column_values ([source](macros/sql/get_column_values.sql)) This macro returns the unique values for a column in a given [relation](https://docs.getdbt.com/docs/writing-code-in-dbt/class-reference/#relation) as an array. -Arguments: +**Args:** - `table` (required): a [Relation](https://docs.getdbt.com/reference/dbt-classes#relation) (a `ref` or `source`) that contains the list of columns you wish to select from - `column` (required): The name of the column you wish to find the column values of - `order_by` (optional, default=`'count(*) desc'`): How the results should be ordered. The default is to order by `count(*) desc`, i.e. decreasing frequency. Setting this as `'my_column'` will sort alphabetically, while `'min(created_at)'` will sort by when thevalue was first observed. @@ -592,6 +592,28 @@ Arguments: ... ``` +#### get_filtered_columns_in_relation ([source](macros/sql/get_filtered_columns_in_relation.sql)) +This macro returns an iterable Jinja list of columns for a given [relation](https://docs.getdbt.com/docs/writing-code-in-dbt/class-reference/#relation), (i.e. not from a CTE) +- optionally exclude columns +- the input values are not case-sensitive (input uppercase or lowercase and it will work!) +> Note: The native [`adapter.get_columns_in_relation` macro](https://docs.getdbt.com/reference/dbt-jinja-functions/adapter#get_columns_in_relation) allows you +to pull column names in a non-filtered fashion, also bringing along with it other (potentially unwanted) information, such as dtype, char_size, numeric_precision, etc. + +**Args:** +- `from` (required): a [Relation](https://docs.getdbt.com/reference/dbt-classes#relation) (a `ref` or `source`) that contains the list of columns you wish to select from +- `except` (optional, default=`[]`): The name of the columns you wish to exclude. (case-insensitive) + +**Usage:** +```sql +-- Returns a list of the columns from a relation, so you can then iterate in a for loop +{% set column_names = dbt_utils.get_filtered_columns_in_relation(from=ref('your_model'), except=["field_1", "field_2"]) %} +... +{% for column_name in column_names %} + max({{ column_name }}) ... as max_'{{ column_name }}', +{% endfor %} +... +``` + #### get_relations_by_pattern ([source](macros/sql/get_relations_by_pattern.sql)) Returns a list of [Relations](https://docs.getdbt.com/docs/writing-code-in-dbt/class-reference/#relation) that match a given schema- or table-name pattern. @@ -770,9 +792,20 @@ group by 1,2,3 ``` #### star ([source](macros/sql/star.sql)) -This macro generates a comma-separated list of all fields that exist in the `from` relation, excluding any fields listed in the `except` argument. The construction is identical to `select * from {{ref('my_model')}}`, replacing star (`*`) with the star macro. This macro also has an optional `relation_alias` argument that will prefix all generated fields with an alias (`relation_alias`.`field_name`). +This macro generates a comma-separated list of all fields that exist in the `from` relation, excluding any fields +listed in the `except` argument. The construction is identical to `select * from {{ref('my_model')}}`, replacing star (`*`) with +the star macro. +This macro also has an optional `relation_alias` argument that will prefix all generated fields with an alias (`relation_alias`.`field_name`). +The macro also has optional `prefix` and `suffix` arguments. When one or both are provided, they will be concatenated onto each field's alias +in the output (`prefix` ~ `field_name` ~ `suffix`). NB: This prevents the output from being used in any context other than a select statement. + -The macro also has optional `prefix` and `suffix` arguments. When one or both are provided, they will be concatenated onto each field's alias in the output (`prefix` ~ `field_name` ~ `suffix`). NB: This prevents the output from being used in any context other than a select statement. +**Args:** +- `from` (required): a [Relation](https://docs.getdbt.com/reference/dbt-classes#relation) (a `ref` or `source`) that contains the list of columns you wish to select from +- `except` (optional, default=`[]`): The name of the columns you wish to exclude. (case-insensitive) +- `relation_alias` (optional, default=`''`): will prefix all generated fields with an alias (`relation_alias`.`field_name`). +- `prefix` (optional, default=`''`): will prefix the output `field_name` (`field_name as prefix_field_name`). +- `suffix` (optional, default=`''`): will suffix the output `field_name` (`field_name as field_name_suffix`). **Usage:** ```sql @@ -789,6 +822,13 @@ from {{ ref('my_model') }} ``` +```sql +select +{{ dbt_utils.star(from=ref('my_model'), except=["exclude_field_1", "exclude_field_2"], prefix="max_") }} +from {{ ref('my_model') }} + +``` + #### union_relations ([source](macros/sql/union.sql)) This macro unions together an array of [Relations](https://docs.getdbt.com/docs/writing-code-in-dbt/class-reference/#relation), @@ -987,9 +1027,16 @@ This macro calculates the difference between two dates. #### split_part ([source](macros/cross_db_utils/split_part.sql)) This macro splits a string of text using the supplied delimiter and returns the supplied part number (1-indexed). +**Args**: +- `string_text` (required): Text to be split into parts. +- `delimiter_text` (required): Text representing the delimiter to split by. +- `part_number` (required): Requested part of the split (1-based). If the value is negative, the parts are counted backward from the end of the string. + **Usage:** +When referencing a column, use one pair of quotes. When referencing a string, use single quotes enclosed in double quotes. ``` -{{ dbt_utils.split_part(string_text='1,2,3', delimiter_text=',', part_number=1) }} +{{ dbt_utils.split_part(string_text='column_to_split', delimiter_text='delimiter_column', part_number=1) }} +{{ dbt_utils.split_part(string_text="'1|2|3'", delimiter_text="'|'", part_number=1) }} ``` #### date_trunc ([source](macros/cross_db_utils/date_trunc.sql)) diff --git a/integration_tests/data/cross_db/data_listagg.csv b/integration_tests/data/cross_db/data_listagg.csv new file mode 100644 index 00000000..ee5083ba --- /dev/null +++ b/integration_tests/data/cross_db/data_listagg.csv @@ -0,0 +1,10 @@ +group_col,string_text,order_col +1,a,1 +1,b,2 +1,c,3 +2,a,2 +2,1,1 +2,p,3 +3,g,1 +3,g,2 +3,g,3 \ No newline at end of file diff --git a/integration_tests/data/cross_db/data_listagg_output.csv b/integration_tests/data/cross_db/data_listagg_output.csv new file mode 100644 index 00000000..a7e1c6c4 --- /dev/null +++ b/integration_tests/data/cross_db/data_listagg_output.csv @@ -0,0 +1,10 @@ +group_col,expected,version +1,"a_|_b_|_c",bottom_ordered +2,"1_|_a_|_p",bottom_ordered +3,"g_|_g_|_g",bottom_ordered +1,"a_|_b",bottom_ordered_limited +2,"1_|_a",bottom_ordered_limited +3,"g_|_g",bottom_ordered_limited +3,"g, g, g",comma_whitespace_unordered +3,"g",distinct_comma +3,"g,g,g",no_params \ No newline at end of file diff --git a/integration_tests/data/sql/data_filtered_columns_in_relation.csv b/integration_tests/data/sql/data_filtered_columns_in_relation.csv new file mode 100644 index 00000000..9d96143b --- /dev/null +++ b/integration_tests/data/sql/data_filtered_columns_in_relation.csv @@ -0,0 +1,4 @@ +field_1,field_2,field_3 +a,b,c +d,e,f +g,h,i \ No newline at end of file diff --git a/integration_tests/data/sql/data_filtered_columns_in_relation_expected.csv b/integration_tests/data/sql/data_filtered_columns_in_relation_expected.csv new file mode 100644 index 00000000..77ea0563 --- /dev/null +++ b/integration_tests/data/sql/data_filtered_columns_in_relation_expected.csv @@ -0,0 +1,2 @@ +field_2,field_3 +h,i \ No newline at end of file diff --git a/integration_tests/macros/assert_equal_values.sql b/integration_tests/macros/assert_equal_values.sql new file mode 100644 index 00000000..d4f02618 --- /dev/null +++ b/integration_tests/macros/assert_equal_values.sql @@ -0,0 +1,32 @@ +{% macro assert_equal_values(actual_object, expected_object) %} +{% if not execute %} + + {# pass #} + +{% elif actual_object != expected_object %} + + {% set msg %} + Expected did not match actual + + ----------- + Actual: + ----------- + --->{{ actual_object }}<--- + + ----------- + Expected: + ----------- + --->{{ expected_object }}<--- + + {% endset %} + + {{ log(msg, info=True) }} + + select 'fail' + +{% else %} + + select 'ok' {{ limit_zero() }} + +{% endif %} +{% endmacro %} \ No newline at end of file diff --git a/integration_tests/models/cross_db_utils/schema.yml b/integration_tests/models/cross_db_utils/schema.yml index dbe7a8f4..e1473c9f 100644 --- a/integration_tests/models/cross_db_utils/schema.yml +++ b/integration_tests/models/cross_db_utils/schema.yml @@ -58,6 +58,12 @@ models: - assert_equal: actual: actual expected: expected + + - name: test_listagg + tests: + - assert_equal: + actual: actual + expected: expected - name: test_safe_cast tests: diff --git a/integration_tests/models/cross_db_utils/test_listagg.sql b/integration_tests/models/cross_db_utils/test_listagg.sql new file mode 100644 index 00000000..006948de --- /dev/null +++ b/integration_tests/models/cross_db_utils/test_listagg.sql @@ -0,0 +1,69 @@ +with data as ( + + select * from {{ ref('data_listagg') }} + +), + +data_output as ( + + select * from {{ ref('data_listagg_output') }} + +), + +calculate as ( + + select + group_col, + {{ dbt_utils.listagg('string_text', "'_|_'", "order by order_col") }} as actual, + 'bottom_ordered' as version + from data + group by group_col + + union all + + select + group_col, + {{ dbt_utils.listagg('string_text', "'_|_'", "order by order_col", 2) }} as actual, + 'bottom_ordered_limited' as version + from data + group by group_col + + union all + + select + group_col, + {{ dbt_utils.listagg('string_text', "', '") }} as actual, + 'comma_whitespace_unordered' as version + from data + where group_col = 3 + group by group_col + + union all + + select + group_col, + {{ dbt_utils.listagg('DISTINCT string_text', "','") }} as actual, + 'distinct_comma' as version + from data + where group_col = 3 + group by group_col + + union all + + select + group_col, + {{ dbt_utils.listagg('string_text') }} as actual, + 'no_params' as version + from data + where group_col = 3 + group by group_col + +) + +select + calculate.actual, + data_output.expected +from calculate +left join data_output +on calculate.group_col = data_output.group_col +and calculate.version = data_output.version \ No newline at end of file diff --git a/integration_tests/models/sql/schema.yml b/integration_tests/models/sql/schema.yml index ad990260..a78e5e1b 100644 --- a/integration_tests/models/sql/schema.yml +++ b/integration_tests/models/sql/schema.yml @@ -50,6 +50,11 @@ models: values: - '5' + - name: test_get_filtered_columns_in_relation + tests: + - dbt_utils.equality: + compare_model: ref('data_filtered_columns_in_relation_expected') + - name: test_get_relations_by_prefix_and_union columns: - name: event @@ -121,6 +126,11 @@ models: - dbt_utils.equality: compare_model: ref('data_star_aggregate_expected') + - name: test_star_uppercase + tests: + - dbt_utils.equality: + compare_model: ref('data_star_expected') + - name: test_surrogate_key tests: - assert_equal: diff --git a/integration_tests/models/sql/test_get_filtered_columns_in_relation.sql b/integration_tests/models/sql/test_get_filtered_columns_in_relation.sql new file mode 100644 index 00000000..7b3ca72f --- /dev/null +++ b/integration_tests/models/sql/test_get_filtered_columns_in_relation.sql @@ -0,0 +1,16 @@ +{% set exclude_field = 'field_1' %} +{% set column_names = dbt_utils.get_filtered_columns_in_relation(from= ref('data_filtered_columns_in_relation'), except=[exclude_field]) %} + +with data as ( + + select + + {% for column_name in column_names %} + max({{ column_name }}) as {{ column_name }} {% if not loop.last %},{% endif %} + {% endfor %} + + from {{ ref('data_filtered_columns_in_relation') }} + +) + +select * from data diff --git a/integration_tests/models/sql/test_star_uppercase.sql b/integration_tests/models/sql/test_star_uppercase.sql new file mode 100644 index 00000000..6179e691 --- /dev/null +++ b/integration_tests/models/sql/test_star_uppercase.sql @@ -0,0 +1,13 @@ +{% set exclude_field = 'FIELD_3' %} + + +with data as ( + + select + {{ dbt_utils.star(from=ref('data_star'), except=[exclude_field]) }} + + from {{ ref('data_star') }} + +) + +select * from data diff --git a/macros/cross_db_utils/listagg.sql b/macros/cross_db_utils/listagg.sql new file mode 100644 index 00000000..1d19a54f --- /dev/null +++ b/macros/cross_db_utils/listagg.sql @@ -0,0 +1,104 @@ +{% macro listagg(measure, delimiter_text="','", order_by_clause=none, limit_num=none) -%} + {{ return(adapter.dispatch('listagg', 'dbt_utils') (measure, delimiter_text, order_by_clause, limit_num)) }} +{%- endmacro %} + +{% macro default__listagg(measure, delimiter_text, order_by_clause, limit_num) -%} + + {% if limit_num -%} + array_to_string( + array_slice( + array_agg( + {{ measure }} + ){% if order_by_clause -%} + within group ({{ order_by_clause }}) + {%- endif %} + ,0 + ,{{ limit_num }} + ), + {{ delimiter_text }} + ) + {%- else %} + listagg( + {{ measure }}, + {{ delimiter_text }} + ) + {% if order_by_clause -%} + within group ({{ order_by_clause }}) + {%- endif %} + {%- endif %} + +{%- endmacro %} + +{% macro bigquery__listagg(measure, delimiter_text, order_by_clause, limit_num) -%} + + string_agg( + {{ measure }}, + {{ delimiter_text }} + {% if order_by_clause -%} + {{ order_by_clause }} + {%- endif %} + {% if limit_num -%} + limit {{ limit_num }} + {%- endif %} + ) + +{%- endmacro %} + +{% macro postgres__listagg(measure, delimiter_text, order_by_clause, limit_num) -%} + + {% if limit_num -%} + array_to_string( + (array_agg( + {{ measure }} + {% if order_by_clause -%} + {{ order_by_clause }} + {%- endif %} + ))[1:{{ limit_num }}], + {{ delimiter_text }} + ) + {%- else %} + string_agg( + {{ measure }}, + {{ delimiter_text }} + {% if order_by_clause -%} + {{ order_by_clause }} + {%- endif %} + ) + {%- endif %} + +{%- endmacro %} + +{# if there are instances of delimiter_text within your measure, you cannot include a limit_num #} +{% macro redshift__listagg(measure, delimiter_text, order_by_clause, limit_num) -%} + + {% if limit_num -%} + {% set ns = namespace() %} + {% set ns.delimiter_text_regex = delimiter_text|trim("'") %} + {% set special_chars %}\,^,$,.,|,?,*,+,(,),[,],{,}{% endset %} + {%- for char in special_chars.split(',') -%} + {% set escape_char %}\\{{ char }}{% endset %} + {% set ns.delimiter_text_regex = ns.delimiter_text_regex|replace(char,escape_char) %} + {%- endfor -%} + + {% set regex %}'([^{{ ns.delimiter_text_regex }}]+{{ ns.delimiter_text_regex }}){1,{{ limit_num - 1}}}[^{{ ns.delimiter_text_regex }}]+'{% endset %} + regexp_substr( + listagg( + {{ measure }}, + {{ delimiter_text }} + ) + {% if order_by_clause -%} + within group ({{ order_by_clause }}) + {%- endif %} + ,{{ regex }} + ) + {%- else %} + listagg( + {{ measure }}, + {{ delimiter_text }} + ) + {% if order_by_clause -%} + within group ({{ order_by_clause }}) + {%- endif %} + {%- endif %} + +{%- endmacro %} \ No newline at end of file diff --git a/macros/sql/get_filtered_columns_in_relation.sql b/macros/sql/get_filtered_columns_in_relation.sql new file mode 100644 index 00000000..7f4af889 --- /dev/null +++ b/macros/sql/get_filtered_columns_in_relation.sql @@ -0,0 +1,25 @@ +{% macro get_filtered_columns_in_relation(from, except=[]) -%} + {{ return(adapter.dispatch('get_filtered_columns_in_relation', 'dbt_utils')(from, except)) }} +{% endmacro %} + +{% macro default__get_filtered_columns_in_relation(from, except=[]) -%} + {%- do dbt_utils._is_relation(from, 'get_filtered_columns_in_relation') -%} + {%- do dbt_utils._is_ephemeral(from, 'get_filtered_columns_in_relation') -%} + + {# -- Prevent querying of db in parsing mode. This works because this macro does not create any new refs. #} + {%- if not execute -%} + {{ return('') }} + {% endif %} + + {%- set include_cols = [] %} + {%- set cols = adapter.get_columns_in_relation(from) -%} + {%- set except = except | map("lower") | list %} + {%- for col in cols -%} + {%- if col.column|lower not in except -%} + {% do include_cols.append(col.column) %} + {%- endif %} + {%- endfor %} + + {{ return(include_cols) }} + +{%- endmacro %} \ No newline at end of file diff --git a/macros/sql/star.sql b/macros/sql/star.sql index 0bfa8c9a..72778b1f 100644 --- a/macros/sql/star.sql +++ b/macros/sql/star.sql @@ -11,20 +11,9 @@ {{ return('') }} {% endif %} - {%- set include_cols = [] %} - {%- set cols = adapter.get_columns_in_relation(from) -%} - {%- set except = except | map("lower") | list %} - {%- for col in cols -%} + {%- for col in dbt_utils.get_filtered_columns_in_relation(from, except) %} - {%- if col.column|lower not in except -%} - {% do include_cols.append(col.column) %} - - {%- endif %} - {%- endfor %} - - {%- for col in include_cols %} - - {%- if relation_alias %}{{ relation_alias }}.{% else %}{%- endif -%}{{ adapter.quote(col)|trim }} {%- if prefix!='' or suffix!='' -%} as {{ adapter.quote(prefix ~ col ~ suffix)|trim }} {%- endif -%} + {%- if relation_alias %}{{ relation_alias }}.{% else %}{%- endif -%}{{ adapter.quote(col)|trim }} {%- if prefix!='' or suffix!='' %} as {{ adapter.quote(prefix ~ col ~ suffix)|trim }} {%- endif -%} {%- if not loop.last %},{{ '\n ' }}{% endif %} {%- endfor -%}