From 93cc28bb492e0ef1cae03a0fb626d32252efdffc Mon Sep 17 00:00:00 2001
From: Eduardo Filho
Date: Tue, 6 Dec 2022 17:36:27 -0500
Subject: [PATCH 1/3] glam-dev: no subdag from histogram_bucket_counts (#1599)

---
 dags/glam_dev.py | 45 ++++++++++++++++++++++++++++++---------------
 1 file changed, 30 insertions(+), 15 deletions(-)

diff --git a/dags/glam_dev.py b/dags/glam_dev.py
index e60be11b1..02d3a036f 100644
--- a/dags/glam_dev.py
+++ b/dags/glam_dev.py
@@ -293,27 +293,42 @@
     dag=dag,
 )
 
+# Testing without SubDag because it keeps getting stuck on "running"
+# without actually executing anything. Also, SubDags are known for causing
+# deadlocks in Celery (which might be our case) and are thus discouraged.
+clients_histogram_bucket_counts = bigquery_etl_query(
+    task_id="clients_histogram_bucket_counts",
+    destination_table="clients_histogram_bucket_counts_v1",
+    dataset_id=dev_dataset_id,
+    project_id=prod_project_id,
+    owner="efilho@mozilla.com",
+    parameters=("submission_date:DATE:{{ds}}",),
+    arguments=("--replace",),
+    dag=dag,
+    docker_image="gcr.io/moz-fx-data-airflow-prod-88e0/glam-dev-bigquery-etl:latest",
+)
+
 # SubdagOperator uses a SequentialExecutor by default
 # so its tasks will run sequentially.
 # Note: In 2.0, SubDagOperator is changed to use airflow scheduler instead of
 # backfill to schedule tasks in the subdag. User no longer need to specify
 # the executor in SubDagOperator. (We don't but the assumption that Sequential
 # Executor is used is now wrong)
-clients_histogram_bucket_counts = SubDagOperator(
-    subdag=repeated_subdag(
-        GLAM_DAG,
-        "clients_histogram_bucket_counts",
-        default_args,
-        dag.schedule_interval,
-        dev_dataset_id,
-        ("submission_date:DATE:{{ds}}",),
-        25,
-        None,
-        docker_image="gcr.io/moz-fx-data-airflow-prod-88e0/glam-dev-bigquery-etl:latest",
-    ),
-    task_id="clients_histogram_bucket_counts",
-    dag=dag,
-)
+#clients_histogram_bucket_counts = SubDagOperator(
+#    subdag=repeated_subdag(
+#        GLAM_DAG,
+#        "clients_histogram_bucket_counts",
+#        default_args,
+#        dag.schedule_interval,
+#        dev_dataset_id,
+#        ("submission_date:DATE:{{ds}}",),
+#        25,
+#        None,
+#        docker_image="gcr.io/moz-fx-data-airflow-prod-88e0/glam-dev-bigquery-etl:latest",
+#    ),
+#    task_id="clients_histogram_bucket_counts",
+#    dag=dag,
+#)
 
 clients_histogram_probe_counts = bigquery_etl_query(
     task_id="clients_histogram_probe_counts",

From d0fbecd14c3f9db682333a1448e565ca9189e4f8 Mon Sep 17 00:00:00 2001
From: Daniel Thorn
Date: Tue, 6 Dec 2022 17:27:16 -0800
Subject: [PATCH 2/3] Add airflow tasks for deploying views and new tables
 from bigquery-etl (#1601)

Co-authored-by: Anna Scholtz
---
 dags/bqetl_artifact_deployment.py | 75 +++++++++++++++++++++++++++++++
 dags/mozfun.py                    | 35 ---------------
 2 files changed, 75 insertions(+), 35 deletions(-)
 create mode 100644 dags/bqetl_artifact_deployment.py
 delete mode 100644 dags/mozfun.py

diff --git a/dags/bqetl_artifact_deployment.py b/dags/bqetl_artifact_deployment.py
new file mode 100644
index 000000000..e7a7178bf
--- /dev/null
+++ b/dags/bqetl_artifact_deployment.py
@@ -0,0 +1,75 @@
+"""
+Nightly deploy of bigquery-etl views.
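+
+Publishes public and persistent UDFs, deploys new tables, and publishes
+views to the moz-fx-data-shared-prod and mozdata projects.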
+"""
+
+from airflow import DAG
+from datetime import timedelta, datetime
+from utils.gcp import gke_command
+from utils.tags import Tag
+
+default_args = {
+    "owner": "ascholtz@mozilla.com",
+    "email": [
+        "ascholtz@mozilla.com",
+        "dthorn@mozilla.com",
+        "telemetry-alerts@mozilla.com",
+    ],
+    "depends_on_past": False,
+    "start_date": datetime(2022, 12, 6),
+    "email_on_failure": True,
+    "email_on_retry": True,
+    "retries": 2,
+    "retry_delay": timedelta(minutes=30),
+}
+
+tags = [Tag.ImpactTier.tier_1]
+
+with DAG("bqetl_artifact_deployment", default_args=default_args, schedule_interval="@daily", doc_md=__doc__, tags=tags,) as dag:
+    docker_image = "gcr.io/moz-fx-data-airflow-prod-88e0/bigquery-etl:latest"
+
+    publish_public_udfs = gke_command(
+        task_id="publish_public_udfs",
+        command=["script/publish_public_udfs"],
+        docker_image=docker_image
+    )
+
+    publish_persistent_udfs = gke_command(
+        task_id="publish_persistent_udfs",
+        cmds=["bash", "-c"],
+        command=[
+            "script/publish_persistent_udfs --target-project=moz-fx-data-shared-prod && "
+            "script/publish_persistent_udfs --target-project=mozdata"
+        ],
+        docker_image=docker_image,
+    )
+
+    publish_new_tables = gke_command(
+        task_id="publish_new_tables",
+        cmds=["bash", "-c"],
+        command=[
+            "script/bqetl generate all && "
+            "script/bqetl query schema update '*' && "
+            "script/bqetl query schema deploy '*' --skip-existing"
+        ],
+        docker_image=docker_image,
+    )
+
+    publish_views = gke_command(
+        task_id="publish_views",
+        cmds=["bash", "-c"],
+        command=[
+            "script/bqetl generate all && "
+            "script/bqetl view publish --target-project=moz-fx-data-shared-prod && "
+            "script/bqetl view publish --target-project=mozdata --user-facing-only && "
+            "script/publish_public_data_views --target-project=moz-fx-data-shared-prod && "
+            "script/publish_public_data_views --target-project=mozdata"
+        ],
+        docker_image=docker_image,
+    )
+
+    publish_views.set_upstream(publish_public_udfs)
+    publish_views.set_upstream(publish_persistent_udfs)
+    publish_views.set_upstream(publish_new_tables)
diff --git a/dags/mozfun.py b/dags/mozfun.py
deleted file mode 100644
index ecb46edac..000000000
--- a/dags/mozfun.py
+++ /dev/null
@@ -1,35 +0,0 @@
-"""
-Nightly deploy of `mozfun` UDFs.
-
-This job has elevated permissions to be able to create new datasets in the mozfun
-project when new logical namespaces are created in the mozfun directory of bigquery-etl.
-This is the reason that individual data engineers cannot deploy UDFs to the mozfun project
-on their own, hence the need for this nightly task.
-"""
-
-from airflow import DAG
-from datetime import timedelta, datetime
-from utils.gcp import gke_command
-from utils.tags import Tag
-
-default_args = {
-    "owner": "ascholtz@mozilla.com",
-    "email": ["ascholtz@mozilla.com"],
-    "depends_on_past": False,
-    "start_date": datetime(2020, 6, 11),
-    "email_on_failure": True,
-    "email_on_retry": True,
-    "retries": 2,
-    "retry_delay": timedelta(minutes=30),
-}
-
-tags = [Tag.ImpactTier.tier_1]
-
-with DAG("mozfun", default_args=default_args, schedule_interval="@daily", doc_md=__doc__, tags=tags,) as dag:
-    docker_image = "gcr.io/moz-fx-data-airflow-prod-88e0/bigquery-etl:latest"
-
-    publish_public_udfs = gke_command(
-        task_id="publish_public_udfs",
-        command=["script/publish_public_udfs"],
-        docker_image=docker_image
-    )

From 88eea5e70c70bb42f7e51521e0bcf7032c9c37a2 Mon Sep 17 00:00:00 2001
From: Daniel Thorn
Date: Tue, 6 Dec 2022 17:47:36 -0800
Subject: [PATCH 3/3] Fix parameter name in bqetl artifact deployment (#1604)

---
 dags/bqetl_artifact_deployment.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dags/bqetl_artifact_deployment.py b/dags/bqetl_artifact_deployment.py
index e7a7178bf..4e007810a 100644
--- a/dags/bqetl_artifact_deployment.py
+++ b/dags/bqetl_artifact_deployment.py
@@ -40,8 +40,8 @@
         task_id="publish_persistent_udfs",
         cmds=["bash", "-c"],
         command=[
-            "script/publish_persistent_udfs --target-project=moz-fx-data-shared-prod && "
-            "script/publish_persistent_udfs --target-project=mozdata"
+            "script/publish_persistent_udfs --project-id=moz-fx-data-shared-prod && "
+            "script/publish_persistent_udfs --project-id=mozdata"
         ],
         docker_image=docker_image,
    )
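
A note on the SubDagOperator removal in PATCH 1/3: the patch swaps the SubDAG
for a single bigquery_etl_query task because the SubDAG kept getting stuck in
the "running" state and SubDAGs are known to deadlock under Celery. Where task
grouping is still wanted, Airflow 2 deprecates SubDAGs in favor of TaskGroup,
whose member tasks are scheduled by the parent DAG on its own executor. The
sketch below only illustrates that pattern and is not code from these patches;
the DAG id, task ids, and partition loop are hypothetical stand-ins, and
EmptyOperator assumes Airflow 2.3 or newer.

    from datetime import datetime

    from airflow import DAG
    from airflow.operators.empty import EmptyOperator
    from airflow.utils.task_group import TaskGroup

    with DAG(
        "taskgroup_sketch",  # hypothetical DAG id
        start_date=datetime(2022, 12, 6),
        schedule_interval="@daily",
    ) as dag:
        start = EmptyOperator(task_id="start")

        # Member tasks are scheduled like any other task in this DAG and run
        # on its executor (e.g. Celery); there is no nested DAG run to wedge.
        with TaskGroup(group_id="clients_histogram_bucket_counts") as bucket_counts:
            for i in range(4):  # hypothetical partition count
                EmptyOperator(task_id=f"partition_{i}")

        start >> bucket_counts

Because the grouped tasks are ordinary tasks in the parent DAG, a stuck or
failed partition can be retried individually instead of re-running a nested
backfill.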