Skip to content

Commit

Permalink
Feature/performance enhancement (#127)
Browse files Browse the repository at this point in the history
* feature/performance-enhancement
  • Loading branch information
fivetran-catfritz authored Apr 30, 2024
1 parent ec9fec4 commit 251785a
Show file tree
Hide file tree
Showing 27 changed files with 329 additions and 369 deletions.
4 changes: 3 additions & 1 deletion .buildkite/hooks/pre-command
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,6 @@ export CI_SNOWFLAKE_DBT_WAREHOUSE=$(gcloud secrets versions access latest --secr
export CI_DATABRICKS_DBT_HOST=$(gcloud secrets versions access latest --secret="CI_DATABRICKS_DBT_HOST" --project="dbt-package-testing-363917")
export CI_DATABRICKS_DBT_HTTP_PATH=$(gcloud secrets versions access latest --secret="CI_DATABRICKS_DBT_HTTP_PATH" --project="dbt-package-testing-363917")
export CI_DATABRICKS_DBT_TOKEN=$(gcloud secrets versions access latest --secret="CI_DATABRICKS_DBT_TOKEN" --project="dbt-package-testing-363917")
# NOTE(review): this export was duplicated verbatim on two consecutive lines
# (diff-render artifact); a single export is sufficient.
export CI_DATABRICKS_DBT_CATALOG=$(gcloud secrets versions access latest --secret="CI_DATABRICKS_DBT_CATALOG" --project="dbt-package-testing-363917")
# SQL-Warehouse-specific credentials for the new databricks-sql CI target.
export CI_DATABRICKS_SQL_DBT_HTTP_PATH=$(gcloud secrets versions access latest --secret="CI_DATABRICKS_SQL_DBT_HTTP_PATH" --project="dbt-package-testing-363917")
export CI_DATABRICKS_SQL_DBT_TOKEN=$(gcloud secrets versions access latest --secret="CI_DATABRICKS_SQL_DBT_TOKEN" --project="dbt-package-testing-363917")
15 changes: 15 additions & 0 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -72,3 +72,18 @@ steps:
- "CI_DATABRICKS_DBT_CATALOG"
commands: |
bash .buildkite/scripts/run_models.sh databricks
# CI step: run the package's integration tests against a Databricks SQL
# Warehouse target (mirrors the All Purpose Cluster step above, but uses the
# SQL-Warehouse-specific http_path/token secrets and the databricks-sql target).
- label: ":databricks: :database: Run Tests - Databricks SQL Warehouse"
  key: "run_dbt_databricks_sql"
  plugins:
  - docker#v3.13.0:
      image: "python:3.8"
      shell: [ "/bin/bash", "-e", "-c" ]
      environment:
        - "BASH_ENV=/tmp/.bashrc"
        - "CI_DATABRICKS_DBT_HOST"
        - "CI_DATABRICKS_SQL_DBT_HTTP_PATH"
        - "CI_DATABRICKS_SQL_DBT_TOKEN"
        - "CI_DATABRICKS_DBT_CATALOG"
  commands: |
    bash .buildkite/scripts/run_models.sh databricks-sql
16 changes: 16 additions & 0 deletions .buildkite/scripts/run_models.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,26 @@ db=$1
# Log the working directory for CI context (printf/$PWD instead of echo `pwd`).
printf '%s\n' "$PWD"
cd integration_tests || exit 1
dbt deps

# Databricks SQL Warehouses do not support this package's incremental
# strategies, so that target runs against a dedicated schema.
if [ "$db" = "databricks-sql" ]; then
    # Build the --vars payloads once. Double quotes are required so the inner
    # single quotes around field_name survive shell quoting — the original
    # nested '...' inside a single-quoted string, which silently drops them.
    sqlw_vars='{jira_schema: jira_integrations_tests_sqlw}'
    sqlw_vars_disabled="{jira_schema: jira_integrations_tests_sqlw, jira_using_priorities: false, jira_using_sprints: false, jira_using_components: false, jira_using_versions: false, jira_field_grain: 'field_name'}"

    dbt seed --vars "$sqlw_vars" --target "$db" --full-refresh
    dbt compile --vars "$sqlw_vars" --target "$db"
    dbt run --vars "$sqlw_vars" --target "$db" --full-refresh
    dbt run --vars "$sqlw_vars" --target "$db"
    dbt test --vars "$sqlw_vars" --target "$db"
    # Re-run with optional features disabled and the alternate field grain.
    dbt run --vars "$sqlw_vars_disabled" --target "$db" --full-refresh
    dbt run --vars "$sqlw_vars_disabled" --target "$db"
    dbt test --vars "$sqlw_vars" --target "$db"

else
    disabled_vars="{jira_using_priorities: false, jira_using_sprints: false, jira_using_components: false, jira_using_versions: false, jira_field_grain: 'field_name'}"

    dbt seed --target "$db" --full-refresh
    dbt compile --target "$db"
    dbt run --target "$db" --full-refresh
    dbt run --target "$db"
    dbt test --target "$db"
    # Re-run with optional features disabled and the alternate field grain.
    dbt run --vars "$disabled_vars" --target "$db" --full-refresh
    dbt run --vars "$disabled_vars" --target "$db"
    dbt test --target "$db"
fi

# Clean up the schemas created by this CI run.
dbt run-operation fivetran_utils.drop_schemas_automation --target "$db"
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,12 @@ Please acknowledge that you have successfully performed the following commands l
- [ ] dbt run (if incremental models are present) && dbt test

Before marking this PR as "ready for review" the following have been applied:
- [ ] The appropriate issue has been linked, tagged, and properly assigned
- [ ] All necessary documentation and version upgrades have been applied
- [ ] docs were regenerated (unless this PR does not include any code or yml updates)
- [ ] BuildKite integration tests are passing
- [ ] Detailed validation steps have been provided below
- [ ] The appropriate issue has been linked, tagged, and properly assigned.
- [ ] All necessary documentation and version upgrades have been applied.
<!--- Be sure to update the package version in the dbt_project.yml, integration_tests/dbt_project.yml, and README if necessary. -->
- [ ] docs were regenerated (unless this PR does not include any code or yml updates).
- [ ] BuildKite integration tests are passing.
- [ ] Detailed validation steps have been provided below.

### Detailed Validation
Please share any and all of your validation steps:
Expand Down
32 changes: 32 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,35 @@
# dbt_jira v0.17.0
[PR #127](https://github.com/fivetran/dbt_jira/pull/127) contains the following updates:

## 🚨 Breaking Changes 🚨
> ⚠️ Since the following changes are breaking, a `--full-refresh` after upgrading will be required.
- To reduce storage, updated the default materialization of the upstream staging models to views. (See the [dbt_jira_source CHANGELOG](https://github.com/fivetran/dbt_jira_source/blob/main/CHANGELOG.md#dbt_jira_source-v070) for more details.)

## Performance improvements (🚨 Breaking Changes 🚨)
- Updated the incremental strategy of the following models to `insert_overwrite` for BigQuery and Databricks All Purpose Cluster destinations and `delete+insert` for all other supported destinations.
- `int_jira__issue_calendar_spine`
- `int_jira__pivot_daily_field_history`
- `jira__daily_issue_field_history`
> At this time, models for Databricks SQL Warehouse destinations are materialized as tables without support for incremental runs.
- Removed intermediate models `int_jira__agg_multiselect_history`, `int_jira__combine_field_histories`, and `int_jira__daily_field_history` by combining them with `int_jira__pivot_daily_field_history`. This is to reduce the redundancy of the data stored in tables, the number of full scans, and the volume of write operations.
- Note that if you have previously run this package, these models may still exist in your destination schema, however they will no longer be updated.
- Updated the default materialization of `int_jira__issue_type_parents` from a table to a view. This model is called only in `int_jira__issue_users`, so a view will reduce storage requirements while not significantly hindering performance.
- For Snowflake and BigQuery destinations, added the following `cluster_by` columns to the configs for incremental models:
- `int_jira__issue_calendar_spine` clustering on columns `['date_day', 'issue_id']`
- `int_jira__pivot_daily_field_history` clustering on columns `['valid_starting_on', 'issue_id']`
- `jira__daily_issue_field_history` clustering on columns `['date_day', 'issue_id']`
- For Databricks All Purpose Cluster destinations, updated incremental model file formats to `parquet` for compatibility with the `insert_overwrite` strategy.

## Features
- Added a default 3-day look-back to incremental models to accommodate late arriving records. The number of days can be changed by setting the var `lookback_window` in your dbt_project.yml. See the [Lookback Window section of the README](https://github.com/fivetran/dbt_jira/blob/main/README.md#lookback-window) for more details.
- Added macro `jira_lookback` to streamline the lookback window calculation.

## Under the Hood
- Added integration testing pipeline for Databricks SQL Warehouse.
- Added macro `jira_is_databricks_sql_warehouse` for detecting if a Databricks target is an All Purpose Cluster or a SQL Warehouse.
- Updated the maintainer pull request template.

# dbt_jira v0.16.0
[PR #122](https://github.com/fivetran/dbt_jira/pull/122) contains the following updates:

Expand Down
25 changes: 23 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,23 @@ dispatch:
search_order: ['spark_utils', 'dbt_utils']
```
### Database Incremental Strategies
Models in this package that are materialized incrementally are configured to work with the different strategies available to each supported warehouse.
For **BigQuery** and **Databricks All Purpose Cluster runtime** destinations, we have chosen `insert_overwrite` as the default strategy, which benefits from the partitioning capability.
> For Databricks SQL Warehouse destinations, models are materialized as tables without support for incremental runs.

For **Snowflake**, **Redshift**, and **Postgres** databases, we have chosen `delete+insert` as the default strategy.

> Regardless of strategy, we recommend that users periodically run a `--full-refresh` to ensure a high level of data quality.

## Step 2: Install the package
Include the following jira package version in your `packages.yml` file:
> TIP: Check [dbt Hub](https://hub.getdbt.com/) for the latest installation instructions or [read the dbt docs](https://docs.getdbt.com/docs/package-management) for more information on installing packages.
```yaml
packages:
- package: fivetran/jira
version: [">=0.16.0", "<0.17.0"]
version: [">=0.17.0", "<0.18.0"]
```
## Step 3: Define database and schema variables
Expand Down Expand Up @@ -131,6 +141,17 @@ vars:
jira_<default_source_table_name>_identifier: your_table_name
```

### Lookback Window
Records from the source can sometimes arrive late. Since several of the models in this package are incremental, by default we look back 3 days to ensure late arrivals are captured while avoiding the need for frequent full refreshes. While the frequency can be reduced, we still recommend running `dbt --full-refresh` periodically to maintain data quality of the models.

To change the default lookback window, add the following variable to your `dbt_project.yml` file:

```yml
vars:
jira:
lookback_window: number_of_days # default is 3
```

## (Optional) Step 6: Orchestrate your models with Fivetran Transformations for dbt Core™
<details><summary>Expand for details</summary>
<br>
Expand All @@ -145,7 +166,7 @@ This dbt package is dependent on the following dbt packages. Please be aware tha
```yml
packages:
- package: fivetran/jira_source
version: [">=0.6.0", "<0.7.0"]
version: [">=0.7.0", "<0.8.0"]
- package: fivetran/fivetran_utils
version: [">=0.4.0", "<0.5.0"]
Expand Down
2 changes: 1 addition & 1 deletion dbt_project.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name: 'jira'
version: '0.16.0'
version: '0.17.0'
config-version: 2
require-dbt-version: [">=1.3.0", "<2.0.0"]
vars:
Expand Down
2 changes: 1 addition & 1 deletion docs/catalog.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/manifest.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/run_results.json

Large diffs are not rendered by default.

10 changes: 9 additions & 1 deletion integration_tests/ci/sample.profiles.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,14 @@ integration_tests:
host: "{{ env_var('CI_DATABRICKS_DBT_HOST') }}"
http_path: "{{ env_var('CI_DATABRICKS_DBT_HTTP_PATH') }}"
schema: jira_integrations_tests_41
threads: 2
threads: 8
token: "{{ env_var('CI_DATABRICKS_DBT_TOKEN') }}"
type: databricks
# New CI target for Databricks SQL Warehouse. Shares the host/catalog secrets
# with the All Purpose Cluster target but uses SQL-Warehouse-specific
# http_path/token secrets and a dedicated schema so runs don't collide.
databricks-sql:
  catalog: "{{ env_var('CI_DATABRICKS_DBT_CATALOG') }}"
  host: "{{ env_var('CI_DATABRICKS_DBT_HOST') }}"
  http_path: "{{ env_var('CI_DATABRICKS_SQL_DBT_HTTP_PATH') }}"
  schema: jira_integrations_tests_sqlw
  threads: 8
  token: "{{ env_var('CI_DATABRICKS_SQL_DBT_TOKEN') }}"
  type: databricks
14 changes: 9 additions & 5 deletions integration_tests/dbt_project.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name: 'jira_integration_tests'
version: '0.16.0'
version: '0.17.0'
config-version: 2
profile: 'integration_tests'

Expand Down Expand Up @@ -30,6 +30,10 @@ vars:

issue_field_history_columns: ['summary', 'story points', 'components']

models:
jira:
+schema: "{{ 'jira_integrations_tests_sqlw' if target.name == 'databricks-sql' else 'jira' }}"

seeds:
jira_integration_tests:
+column_types:
Expand Down Expand Up @@ -57,17 +61,17 @@ seeds:
work_ratio: float
resolved: timestamp
updated: timestamp
assignee: "{{ 'string' if target.name in ('bigquery', 'spark', 'databricks') else 'varchar' }}"
assignee: "{{ 'string' if target.name in ('bigquery', 'spark', 'databricks', 'databricks-sql') else 'varchar' }}"
issue_multiselect_history:
+column_types:
time: timestamp
issue_id: "{{ 'int64' if target.name == 'bigquery' else 'bigint' }}"
value: "{{ 'string' if target.name in ('bigquery', 'spark', 'databricks') else 'varchar' }}"
value: "{{ 'string' if target.name in ('bigquery', 'spark', 'databricks', 'databricks-sql') else 'varchar' }}"
issue_field_history:
+column_types:
time: timestamp
issue_id: "{{ 'int64' if target.name == 'bigquery' else 'bigint' }}"
value: "{{ 'string' if target.name in ('bigquery', 'spark', 'databricks') else 'varchar' }}"
value: "{{ 'string' if target.name in ('bigquery', 'spark', 'databricks', 'databricks-sql') else 'varchar' }}"
sprint:
+column_types:
id: "{{ 'int64' if target.name == 'bigquery' else 'bigint' }}"
Expand All @@ -76,7 +80,7 @@ seeds:
start_date: timestamp
field:
+column_types:
id: "{{ 'string' if target.name in ('bigquery', 'spark', 'databricks') else 'varchar' }}"
id: "{{ 'string' if target.name in ('bigquery', 'spark', 'databricks', 'databricks-sql') else 'varchar' }}"

dispatch:
- macro_namespace: dbt_utils
Expand Down
2 changes: 1 addition & 1 deletion integration_tests/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@ dbt-redshift>=1.3.0,<2.0.0
dbt-postgres>=1.3.0,<2.0.0
dbt-spark>=1.3.0,<2.0.0
dbt-spark[PyHive]>=1.3.0,<2.0.0
dbt-databricks>=1.3.0,<2.0.0
dbt-databricks>=1.3.0,<2.0.0
15 changes: 15 additions & 0 deletions macros/jira_is_databricks_sql_warehouse.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{% macro jira_is_databricks_sql_warehouse() %}
    {#- Returns True when the current Databricks target is a SQL Warehouse
        (its http_path contains "sql/<version>/warehouses/"), and False for
        an All Purpose Cluster or any non-Databricks target.
        Fix: `target.type in ('databricks')` was a substring test — parens
        around a single string are not a tuple in Jinja — so use equality. -#}
    {% if target.type == 'databricks' %}
        {% set re = modules.re %}
        {% set regex_pattern = "sql/.+/warehouses/" %}
        {% set match_result = re.search(regex_pattern, target.http_path) %}
        {% if match_result %}
            {{ return(True) }}
        {% else %}
            {{ return(False) }}
        {% endif %}
    {% else %}
        {{ return(False) }}
    {% endif %}
{% endmacro %}
18 changes: 18 additions & 0 deletions macros/jira_lookback.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{% macro jira_lookback(from_date, datepart, interval, safety_date='2010-01-01') %}
{#- Returns a SQL dateadd expression for (max value of `from_date` in `this`)
    minus `interval` `datepart`s, used as an incremental lookback window.
    Falls back to `safety_date` when the model is empty / from_date is null.
    Fix: the dispatch call hard-coded safety_date='2010-01-01', silently
    ignoring a caller-supplied value — forward the argument instead. -#}
{{ adapter.dispatch('jira_lookback', 'jira') (from_date, datepart, interval, safety_date=safety_date) }}

{%- endmacro %}

{% macro default__jira_lookback(from_date, datepart, interval, safety_date='2010-01-01') %}

{#- Query the current table for the most recent from_date value. -#}
{% set sql_statement %}
    select coalesce({{ from_date }}, {{ "'" ~ safety_date ~ "'" }})
    from {{ this }}
{%- endset -%}

{%- set result = dbt_utils.get_single_value(sql_statement) %}

{#- Subtract the lookback interval from that date (negative interval). -#}
{{ dbt.dateadd(datepart=datepart, interval=-interval, from_date_or_timestamp="cast('" ~ result ~ "' as date)") }}

{% endmacro %}

This file was deleted.

Loading

0 comments on commit 251785a

Please sign in to comment.