
Commit

Updates
ilias1111 committed Nov 13, 2024
1 parent fb55b41 commit ffff3fb
Showing 4 changed files with 8 additions and 5 deletions.
4 changes: 2 additions & 2 deletions integration_tests/.scripts/integration_tests.sh
@@ -23,8 +23,8 @@ fi

for db in ${DATABASES[@]}; do

if [[ "$db" == "bigquery" || "$db" == "spark_iceberg" ]]; then
echo "Snowplow web integration tests: Seeding data and doing first run"
if [[ "$db" == "bigquery" ]]; then
echo "Snowplow integration tests: Seeding data and doing first run"

eval "dbt seed --target $db --full-refresh" || exit 1
eval "dbt run --target $db --full-refresh" || exit 1
2 changes: 2 additions & 0 deletions integration_tests/macros/test_normalize_events.sql
@@ -122,6 +122,8 @@ It runs 9 tests:
{% macro spark__test_normalize_events() %}
-- Main difference here is that spark doesnt need the catalog in the from clause
{% set expected_dict = {
"flat_cols_only" : "select event_id , collector_tstamp , DATE(collector_tstamp) as collector_tstamp_date -- Flat columns from event table , app_id -- self describing events columns from event table -- context column(s) from the event table from "~target.schema~"_scratch.snowplow_normalize_base_events_this_run where event_name in ('event_name')",
"sde_plus_cols" : "select event_id , collector_tstamp , DATE(collector_tstamp) as collector_tstamp_date -- Flat columns from event table , app_id -- self describing events columns from event table , UNSTRUCT_EVENT_TEST_1.test_id as test_id , UNSTRUCT_EVENT_TEST_1.test_class as test_class -- context column(s) from the event table from "~target.schema~"_scratch.snowplow_normalize_base_events_this_run where event_name in ('event_name')",
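
The Spark expectation differs from the other targets only in how the base table is referenced: there is no catalog prefix in the from clause. A minimal sketch of the contrast, using hypothetical catalog and schema names rather than anything taken from the package:

-- targets that use a catalog (e.g. Databricks): assumed three-part table reference
select event_id, collector_tstamp
from my_catalog.my_schema_scratch.snowplow_normalize_base_events_this_run

-- spark: two-part reference, as in the expected strings above
select event_id, collector_tstamp
from my_schema_scratch.snowplow_normalize_base_events_this_run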
2 changes: 1 addition & 1 deletion integration_tests/macros/test_users_table.sql
@@ -72,7 +72,7 @@ It runs 6 tests:
{% macro spark__test_users_table() %}
-- Main difference here is that spark doesnt need the catalog in the from clause
{% set expected_dict = {
"1_context" : "with defined_user_id as ( select user_id as user_id , collector_tstamp as latest_collector_tstamp , DATE(collector_tstamp) as latest_collector_tstamp_date -- Flat columns from event table -- user column(s) from the event table , CONTEXTS_TEST_1[0].context_test_id as context_test_id , CONTEXTS_TEST_1[0].context_test_class as context_test_class from "~target.schema~"_scratch.snowplow_normalize_base_events_this_run where 1 = 1 ), users_ordering as ( select a.* , row_number() over (partition by user_id order by latest_collector_tstamp desc) as rn from defined_user_id a where user_id is not null ) select * except (rn) from users_ordering where rn = 1",
"2_context" : "with defined_user_id as ( select user_id as user_id , collector_tstamp as latest_collector_tstamp , DATE(collector_tstamp) as latest_collector_tstamp_date -- Flat columns from event table -- user column(s) from the event table , CONTEXTS_TEST_1[0].context_test_id as context_test_id , CONTEXTS_TEST_1[0].context_test_class as context_test_class , CONTEXT_TEST2_1[0].context_test_id2 as context_test_id2 , CONTEXT_TEST2_1[0].context_test_class2 as context_test_class2 from "~target.schema~"_scratch.snowplow_normalize_base_events_this_run where 1 = 1 ), users_ordering as ( select a.* , row_number() over (partition by user_id order by latest_collector_tstamp desc) as rn from defined_user_id a where user_id is not null ) select * except (rn) from users_ordering where rn = 1",
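
The expected users-table SQL above follows a latest-record-per-user pattern: rank each user's rows by collector timestamp, keep the newest, and drop the helper column. A stripped-down sketch of that pattern, with illustrative table and column names:

with defined_user_id as (
    select
        user_id,
        collector_tstamp as latest_collector_tstamp,
        some_context_field  -- illustrative user/context column
    from my_schema_scratch.snowplow_normalize_base_events_this_run
), users_ordering as (
    select
        a.*,
        row_number() over (partition by user_id order by latest_collector_tstamp desc) as rn
    from defined_user_id a
    where user_id is not null
)
select * except (rn)  -- except() drops the ranking column on BigQuery/Databricks/Spark
from users_ordering
where rn = 1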
5 changes: 3 additions & 2 deletions (filename not shown)
@@ -14,7 +14,7 @@ You may obtain a copy of the Snowplow Personal and Academic License Version 1.0

{%- set lower_limit, upper_limit, session_start_limit = snowplow_utils.return_base_new_event_limits(ref('snowplow_normalize_base_new_event_limits')) %}

- with final_cte AS (
+ with prep AS (

select
a.*
@@ -38,13 +38,14 @@ with final_cte AS (
and a.derived_tstamp <= {{ upper_limit }}
{% endif %}
and {{ snowplow_utils.app_id_filter(var("snowplow__app_id",[])) }}
+ -- We are doing the branching in order not to do the qualify in the case of spark, as it does not support it
{% if target.type in ['databricks','snowflake','bigquery'] %}
qualify row_number() over (partition by a.event_id order by a.collector_tstamp{% if target.type in ['databricks', 'spark'] -%}, a.etl_tstamp {%- endif %}) = 1
{% endif %}
)

SELECT *
- FROM final_cte
+ FROM prep
{% if target.type not in ['databricks','snowflake','bigquery'] %}
WHERE rn = 1
{% endif %}
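
The branching above exists because QUALIFY lets Snowflake, BigQuery and Databricks filter on a window function inline, while Spark has to materialise the row number in the CTE and filter it in the outer select. A rough sketch of the two equivalent de-duplication forms (the events table name is a placeholder, not the model's actual source):

-- with QUALIFY (snowflake / bigquery / databricks): filter inline on the window function
select a.*
from events a
qualify row_number() over (partition by a.event_id order by a.collector_tstamp) = 1

-- without QUALIFY (spark): expose the row number, then filter it outside the CTE
with prep as (
    select
        a.*,
        row_number() over (partition by a.event_id order by a.collector_tstamp) as rn
    from events a
)
select *
from prep
where rn = 1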
