Skip to content

Commit

Permalink
Feature/performance enhancement (#127)
Browse files Browse the repository at this point in the history
* feature/performance-enhancement
  • Loading branch information
fivetran-catfritz authored Apr 30, 2024
1 parent ec9fec4 commit 251785a
Show file tree
Hide file tree
Showing 27 changed files with 329 additions and 369 deletions.
4 changes: 3 additions & 1 deletion .buildkite/hooks/pre-command
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,6 @@ export CI_SNOWFLAKE_DBT_WAREHOUSE=$(gcloud secrets versions access latest --secr
export CI_DATABRICKS_DBT_HOST=$(gcloud secrets versions access latest --secret="CI_DATABRICKS_DBT_HOST" --project="dbt-package-testing-363917")
export CI_DATABRICKS_DBT_HTTP_PATH=$(gcloud secrets versions access latest --secret="CI_DATABRICKS_DBT_HTTP_PATH" --project="dbt-package-testing-363917")
export CI_DATABRICKS_DBT_TOKEN=$(gcloud secrets versions access latest --secret="CI_DATABRICKS_DBT_TOKEN" --project="dbt-package-testing-363917")
# NOTE(review): this export was duplicated verbatim on two consecutive lines
# (diff-render artifact); a single export is sufficient.
export CI_DATABRICKS_DBT_CATALOG=$(gcloud secrets versions access latest --secret="CI_DATABRICKS_DBT_CATALOG" --project="dbt-package-testing-363917")
# SQL-Warehouse-specific credentials for the new databricks-sql CI target.
export CI_DATABRICKS_SQL_DBT_HTTP_PATH=$(gcloud secrets versions access latest --secret="CI_DATABRICKS_SQL_DBT_HTTP_PATH" --project="dbt-package-testing-363917")
export CI_DATABRICKS_SQL_DBT_TOKEN=$(gcloud secrets versions access latest --secret="CI_DATABRICKS_SQL_DBT_TOKEN" --project="dbt-package-testing-363917")
15 changes: 15 additions & 0 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -72,3 +72,18 @@ steps:
- "CI_DATABRICKS_DBT_CATALOG"
commands: |
bash .buildkite/scripts/run_models.sh databricks
# CI step: run the package's integration tests against a Databricks SQL
# Warehouse target (mirrors the All Purpose Cluster step above, but uses the
# SQL-Warehouse-specific http_path/token secrets and the databricks-sql target).
- label: ":databricks: :database: Run Tests - Databricks SQL Warehouse"
  key: "run_dbt_databricks_sql"
  plugins:
  - docker#v3.13.0:
      image: "python:3.8"
      shell: [ "/bin/bash", "-e", "-c" ]
      environment:
        - "BASH_ENV=/tmp/.bashrc"
        - "CI_DATABRICKS_DBT_HOST"
        - "CI_DATABRICKS_SQL_DBT_HTTP_PATH"
        - "CI_DATABRICKS_SQL_DBT_TOKEN"
        - "CI_DATABRICKS_DBT_CATALOG"
  commands: |
    bash .buildkite/scripts/run_models.sh databricks-sql
16 changes: 16 additions & 0 deletions .buildkite/scripts/run_models.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,26 @@ db=$1
# Log the working directory for CI context (printf/$PWD instead of echo `pwd`).
printf '%s\n' "$PWD"
cd integration_tests || exit 1
dbt deps

# Databricks SQL Warehouses do not support this package's incremental
# strategies, so that target runs against a dedicated schema.
if [ "$db" = "databricks-sql" ]; then
    # Build the --vars payloads once. Double quotes are required so the inner
    # single quotes around field_name survive shell quoting — the original
    # nested '...' inside a single-quoted string, which silently drops them.
    sqlw_vars='{jira_schema: jira_integrations_tests_sqlw}'
    sqlw_vars_disabled="{jira_schema: jira_integrations_tests_sqlw, jira_using_priorities: false, jira_using_sprints: false, jira_using_components: false, jira_using_versions: false, jira_field_grain: 'field_name'}"

    dbt seed --vars "$sqlw_vars" --target "$db" --full-refresh
    dbt compile --vars "$sqlw_vars" --target "$db"
    dbt run --vars "$sqlw_vars" --target "$db" --full-refresh
    dbt run --vars "$sqlw_vars" --target "$db"
    dbt test --vars "$sqlw_vars" --target "$db"
    # Re-run with optional features disabled and the alternate field grain.
    dbt run --vars "$sqlw_vars_disabled" --target "$db" --full-refresh
    dbt run --vars "$sqlw_vars_disabled" --target "$db"
    dbt test --vars "$sqlw_vars" --target "$db"

else
    disabled_vars="{jira_using_priorities: false, jira_using_sprints: false, jira_using_components: false, jira_using_versions: false, jira_field_grain: 'field_name'}"

    dbt seed --target "$db" --full-refresh
    dbt compile --target "$db"
    dbt run --target "$db" --full-refresh
    dbt run --target "$db"
    dbt test --target "$db"
    # Re-run with optional features disabled and the alternate field grain.
    dbt run --vars "$disabled_vars" --target "$db" --full-refresh
    dbt run --vars "$disabled_vars" --target "$db"
    dbt test --target "$db"
fi

# Clean up the schemas created by this CI run.
dbt run-operation fivetran_utils.drop_schemas_automation --target "$db"
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,12 @@ Please acknowledge that you have successfully performed the following commands l
- [ ] dbt run (if incremental models are present) && dbt test

Before marking this PR as "ready for review" the following have been applied:
- [ ] The appropriate issue has been linked, tagged, and properly assigned
- [ ] All necessary documentation and version upgrades have been applied
- [ ] docs were regenerated (unless this PR does not include any code or yml updates)
- [ ] BuildKite integration tests are passing
- [ ] Detailed validation steps have been provided below
- [ ] The appropriate issue has been linked, tagged, and properly assigned.
- [ ] All necessary documentation and version upgrades have been applied.
<!--- Be sure to update the package version in the dbt_project.yml, integration_tests/dbt_project.yml, and README if necessary. -->
- [ ] docs were regenerated (unless this PR does not include any code or yml updates).
- [ ] BuildKite integration tests are passing.
- [ ] Detailed validation steps have been provided below.

### Detailed Validation
Please share any and all of your validation steps:
Expand Down
32 changes: 32 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,35 @@
# dbt_jira v0.17.0
[PR #127](https://github.com/fivetran/dbt_jira/pull/127) contains the following updates:

## 🚨 Breaking Changes 🚨
> ⚠️ Since the following changes are breaking, a `--full-refresh` after upgrading will be required.
- To reduce storage, updated the default materialization of the upstream staging models to views. (See the [dbt_jira_source CHANGELOG](https://github.com/fivetran/dbt_jira_source/blob/main/CHANGELOG.md#dbt_jira_source-v070) for more details.)

## Performance improvements (🚨 Breaking Changes 🚨)
- Updated the incremental strategy of the following models to `insert_overwrite` for BigQuery and Databricks All Purpose Cluster destinations and `delete+insert` for all other supported destinations.
- `int_jira__issue_calendar_spine`
- `int_jira__pivot_daily_field_history`
- `jira__daily_issue_field_history`
> At this time, models for Databricks SQL Warehouse destinations are materialized as tables without support for incremental runs.
- Removed intermediate models `int_jira__agg_multiselect_history`, `int_jira__combine_field_histories`, and `int_jira__daily_field_history` by combining them with `int_jira__pivot_daily_field_history`. This is to reduce the redundancy of the data stored in tables, the number of full scans, and the volume of write operations.
- Note that if you have previously run this package, these models may still exist in your destination schema, however they will no longer be updated.
- Updated the default materialization of `int_jira__issue_type_parents` from a table to a view. This model is called only in `int_jira__issue_users`, so a view will reduce storage requirements while not significantly hindering performance.
- For Snowflake and BigQuery destinations, added the following `cluster_by` columns to the configs for incremental models:
- `int_jira__issue_calendar_spine` clustering on columns `['date_day', 'issue_id']`
- `int_jira__pivot_daily_field_history` clustering on columns `['valid_starting_on', 'issue_id']`
- `jira__daily_issue_field_history` clustering on columns `['date_day', 'issue_id']`
- For Databricks All Purpose Cluster destinations, updated incremental model file formats to `parquet` for compatibility with the `insert_overwrite` strategy.

## Features
- Added a default 3-day look-back to incremental models to accommodate late arriving records. The number of days can be changed by setting the var `lookback_window` in your dbt_project.yml. See the [Lookback Window section of the README](https://github.com/fivetran/dbt_jira/blob/main/README.md#lookback-window) for more details.
- Added macro `jira_lookback` to streamline the lookback window calculation.

## Under the Hood
- Added integration testing pipeline for Databricks SQL Warehouse.
- Added macro `jira_is_databricks_sql_warehouse` for detecting if a Databricks target is an All Purpose Cluster or a SQL Warehouse.
- Updated the maintainer pull request template.

# dbt_jira v0.16.0
[PR #122](https://github.com/fivetran/dbt_jira/pull/122) contains the following updates:

Expand Down
25 changes: 23 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,23 @@ dispatch:
search_order: ['spark_utils', 'dbt_utils']
```
### Database Incremental Strategies
Models in this package that are materialized incrementally are configured to work with the different strategies available to each supported warehouse.
For **BigQuery** and **Databricks All Purpose Cluster runtime** destinations, we have chosen `insert_overwrite` as the default strategy, which benefits from the partitioning capability.
> For Databricks SQL Warehouse destinations, models are materialized as tables without support for incremental runs.

For **Snowflake**, **Redshift**, and **Postgres** databases, we have chosen `delete+insert` as the default strategy.

> Regardless of strategy, we recommend that users periodically run a `--full-refresh` to ensure a high level of data quality.

## Step 2: Install the package
Include the following jira package version in your `packages.yml` file:
> TIP: Check [dbt Hub](https://hub.getdbt.com/) for the latest installation instructions or [read the dbt docs](https://docs.getdbt.com/docs/package-management) for more information on installing packages.
```yaml
packages:
- package: fivetran/jira
version: [">=0.16.0", "<0.17.0"]
version: [">=0.17.0", "<0.18.0"]
```
## Step 3: Define database and schema variables
Expand Down Expand Up @@ -131,6 +141,17 @@ vars:
jira_<default_source_table_name>_identifier: your_table_name
```

### Lookback Window
Records from the source can sometimes arrive late. Since several of the models in this package are incremental, by default we look back 3 days to ensure late arrivals are captured while avoiding the need for frequent full refreshes. While the frequency can be reduced, we still recommend running `dbt --full-refresh` periodically to maintain data quality of the models.

To change the default lookback window, add the following variable to your `dbt_project.yml` file:

```yml
vars:
jira:
lookback_window: number_of_days # default is 3
```

## (Optional) Step 6: Orchestrate your models with Fivetran Transformations for dbt Core™
<details><summary>Expand for details</summary>
<br>
Expand All @@ -145,7 +166,7 @@ This dbt package is dependent on the following dbt packages. Please be aware tha
```yml
packages:
- package: fivetran/jira_source
version: [">=0.6.0", "<0.7.0"]
version: [">=0.7.0", "<0.8.0"]
- package: fivetran/fivetran_utils
version: [">=0.4.0", "<0.5.0"]
Expand Down
2 changes: 1 addition & 1 deletion dbt_project.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name: 'jira'
version: '0.16.0'
version: '0.17.0'
config-version: 2
require-dbt-version: [">=1.3.0", "<2.0.0"]
vars:
Expand Down
2 changes: 1 addition & 1 deletion docs/catalog.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/manifest.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/run_results.json

Large diffs are not rendered by default.

10 changes: 9 additions & 1 deletion integration_tests/ci/sample.profiles.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,14 @@ integration_tests:
host: "{{ env_var('CI_DATABRICKS_DBT_HOST') }}"
http_path: "{{ env_var('CI_DATABRICKS_DBT_HTTP_PATH') }}"
schema: jira_integrations_tests_41
threads: 2
threads: 8
token: "{{ env_var('CI_DATABRICKS_DBT_TOKEN') }}"
type: databricks
# New CI target for Databricks SQL Warehouse. Shares the host/catalog secrets
# with the All Purpose Cluster target but uses SQL-Warehouse-specific
# http_path/token secrets and a dedicated schema so runs don't collide.
databricks-sql:
  catalog: "{{ env_var('CI_DATABRICKS_DBT_CATALOG') }}"
  host: "{{ env_var('CI_DATABRICKS_DBT_HOST') }}"
  http_path: "{{ env_var('CI_DATABRICKS_SQL_DBT_HTTP_PATH') }}"
  schema: jira_integrations_tests_sqlw
  threads: 8
  token: "{{ env_var('CI_DATABRICKS_SQL_DBT_TOKEN') }}"
  type: databricks
14 changes: 9 additions & 5 deletions integration_tests/dbt_project.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name: 'jira_integration_tests'
version: '0.16.0'
version: '0.17.0'
config-version: 2
profile: 'integration_tests'

Expand Down Expand Up @@ -30,6 +30,10 @@ vars:

issue_field_history_columns: ['summary', 'story points', 'components']

models:
jira:
+schema: "{{ 'jira_integrations_tests_sqlw' if target.name == 'databricks-sql' else 'jira' }}"

seeds:
jira_integration_tests:
+column_types:
Expand Down Expand Up @@ -57,17 +61,17 @@ seeds:
work_ratio: float
resolved: timestamp
updated: timestamp
assignee: "{{ 'string' if target.name in ('bigquery', 'spark', 'databricks') else 'varchar' }}"
assignee: "{{ 'string' if target.name in ('bigquery', 'spark', 'databricks', 'databricks-sql') else 'varchar' }}"
issue_multiselect_history:
+column_types:
time: timestamp
issue_id: "{{ 'int64' if target.name == 'bigquery' else 'bigint' }}"
value: "{{ 'string' if target.name in ('bigquery', 'spark', 'databricks') else 'varchar' }}"
value: "{{ 'string' if target.name in ('bigquery', 'spark', 'databricks', 'databricks-sql') else 'varchar' }}"
issue_field_history:
+column_types:
time: timestamp
issue_id: "{{ 'int64' if target.name == 'bigquery' else 'bigint' }}"
value: "{{ 'string' if target.name in ('bigquery', 'spark', 'databricks') else 'varchar' }}"
value: "{{ 'string' if target.name in ('bigquery', 'spark', 'databricks', 'databricks-sql') else 'varchar' }}"
sprint:
+column_types:
id: "{{ 'int64' if target.name == 'bigquery' else 'bigint' }}"
Expand All @@ -76,7 +80,7 @@ seeds:
start_date: timestamp
field:
+column_types:
id: "{{ 'string' if target.name in ('bigquery', 'spark', 'databricks') else 'varchar' }}"
id: "{{ 'string' if target.name in ('bigquery', 'spark', 'databricks', 'databricks-sql') else 'varchar' }}"

dispatch:
- macro_namespace: dbt_utils
Expand Down
2 changes: 1 addition & 1 deletion integration_tests/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@ dbt-redshift>=1.3.0,<2.0.0
dbt-postgres>=1.3.0,<2.0.0
dbt-spark>=1.3.0,<2.0.0
dbt-spark[PyHive]>=1.3.0,<2.0.0
dbt-databricks>=1.3.0,<2.0.0
dbt-databricks>=1.3.0,<2.0.0
15 changes: 15 additions & 0 deletions macros/jira_is_databricks_sql_warehouse.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{% macro jira_is_databricks_sql_warehouse() %}
    {#- Returns True when the current Databricks target is a SQL Warehouse
        (its http_path contains "sql/<version>/warehouses/"), and False for
        an All Purpose Cluster or any non-Databricks target.
        Fix: `target.type in ('databricks')` was a substring test — parens
        around a single string are not a tuple in Jinja — so use equality. -#}
    {% if target.type == 'databricks' %}
        {% set re = modules.re %}
        {% set regex_pattern = "sql/.+/warehouses/" %}
        {% set match_result = re.search(regex_pattern, target.http_path) %}
        {% if match_result %}
            {{ return(True) }}
        {% else %}
            {{ return(False) }}
        {% endif %}
    {% else %}
        {{ return(False) }}
    {% endif %}
{% endmacro %}
18 changes: 18 additions & 0 deletions macros/jira_lookback.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{% macro jira_lookback(from_date, datepart, interval, safety_date='2010-01-01') %}
{#- Returns a SQL dateadd expression for (max value of `from_date` in `this`)
    minus `interval` `datepart`s, used as an incremental lookback window.
    Falls back to `safety_date` when the model is empty / from_date is null.
    Fix: the dispatch call hard-coded safety_date='2010-01-01', silently
    ignoring a caller-supplied value — forward the argument instead. -#}
{{ adapter.dispatch('jira_lookback', 'jira') (from_date, datepart, interval, safety_date=safety_date) }}

{%- endmacro %}

{% macro default__jira_lookback(from_date, datepart, interval, safety_date='2010-01-01') %}

{#- Query the current table for the most recent from_date value. -#}
{% set sql_statement %}
    select coalesce({{ from_date }}, {{ "'" ~ safety_date ~ "'" }})
    from {{ this }}
{%- endset -%}

{%- set result = dbt_utils.get_single_value(sql_statement) %}

{#- Subtract the lookback interval from that date (negative interval). -#}
{{ dbt.dateadd(datepart=datepart, interval=-interval, from_date_or_timestamp="cast('" ~ result ~ "' as date)") }}

{% endmacro %}

This file was deleted.

Loading

0 comments on commit 251785a

Please sign in to comment.