[Feature] Update data platform blueprint with Dataflow Flex template (#1105)

* Add initial dataflow template code + TF infra

* Refactor the datapipeline DAG to use the Flex template operator, clean up code

* Remove unneeded bash scripts, update README with manual examples

* Refactor datapipeline_dc_tags.py and include new Flex template

* Update docs to reflect changes

* Remove sub-dependencies and keep Apache Beam

* Add missing license headers and update tests

* Set resources to 291 in tests

* Update outputs via tfdoc

* Update outputs order and tfdoc

* Correct number of resources

* Fix command to take region from var

* Enable service account impersonation for running builds

* Update example dataflow run command to use orchestrator SA

* Remove hard-coded values in example

* Keep original Airflow files, add new ones that use the Flex template as an example

* Update tests and doc

* Fix number of resources in plan

* Run tfdoc, remove files section in README

* Fix number of modules in tfdoc

* Update number of resources

* Add missing service account

* Update DF demo README

* Quick rename

---------

Co-authored-by: lcaggio <[email protected]>
Co-authored-by: Ludovico Magnocavallo <[email protected]>
3 people authored Feb 6, 2023
1 parent 884ec71 commit 02d8d83
Showing 16 changed files with 1,152 additions and 20 deletions.
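
Several of the commits above swap ad-hoc bash launch scripts for Airflow's Dataflow Flex Template operator. The sketch below illustrates that pattern as context for the diff that follows; every project, bucket, and parameter name is a placeholder, not a value taken from this commit.

```python
# Illustrative sketch only: all names and parameters are placeholders.
import datetime

from airflow import models
from airflow.providers.google.cloud.operators.dataflow import (
    DataflowStartFlexTemplateOperator,
)

with models.DAG(
    dag_id="data_pipeline_flex_template",
    start_date=datetime.datetime(2023, 2, 1),
    schedule_interval=None,
) as dag:
    csv_to_bq = DataflowStartFlexTemplateOperator(
        task_id="csv_to_bigquery",
        project_id="my-orch-project",  # placeholder
        location="europe-west1",       # placeholder
        body={
            "launchParameter": {
                "jobName": "csv2bq-{{ ds_nodash }}",
                # GCS path of the Flex Template spec produced by the build.
                "containerSpecGcsPath": "gs://my-df-template-bucket/csv2bq.json",
                "parameters": {
                    "csv_file": "gs://my-drop-bucket/customers.csv",
                    "output_table": "my-project:my_dataset.customers",
                },
                "environment": {
                    "serviceAccountEmail": "sa-df-load@my-project.iam.gserviceaccount.com",
                    "tempLocation": "gs://my-staging-bucket/tmp",
                },
            }
        },
    )
```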
@@ -40,6 +40,7 @@ locals {
LOD_SA_DF = module.load-sa-df-0.email
ORC_PRJ = module.orch-project.project_id
ORC_GCS = module.orch-cs-0.url
ORC_GCS_TMP_DF = module.orch-cs-df-template.url
TRF_PRJ = module.transf-project.project_id
TRF_GCS_STAGING = module.transf-cs-df-0.url
TRF_NET_VPC = local.transf_vpc
@@ -25,6 +25,11 @@ locals {
? var.network_config.network_self_link
: module.orch-vpc.0.self_link
)

# Note: this formatting is needed for output purposes, since the Fabric artifact
# registry module doesn't yet expose the Docker usage path of a registry folder
# in the required format.
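# Example result (illustrative): "europe-west1-docker.pkg.dev/<project-id>/<prefix>-app-images"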
orch_docker_path = format("%s-docker.pkg.dev/%s/%s",
var.region, module.orch-project.project_id, module.orch-artifact-reg.name)
}

module "orch-project" {
@@ -44,6 +49,8 @@ module "orch-project" {
"roles/iam.serviceAccountUser",
"roles/storage.objectAdmin",
"roles/storage.admin",
"roles/artifactregistry.admin",
"roles/serviceusage.serviceUsageConsumer",
]
}
iam = {
@@ -65,7 +72,15 @@
]
"roles/storage.objectAdmin" = [
module.orch-sa-cmp-0.iam_email,
module.orch-sa-df-build.iam_email,
"serviceAccount:${module.orch-project.service_accounts.robots.composer}",
"serviceAccount:${module.orch-project.service_accounts.robots.cloudbuild}",
]
"roles/artifactregistry.reader" = [
module.load-sa-df-0.iam_email,
]
"roles/cloudbuild.serviceAgent" = [
module.orch-sa-df-build.iam_email,
]
"roles/storage.objectViewer" = [module.load-sa-df-0.iam_email]
}
@@ -81,6 +96,7 @@
"compute.googleapis.com",
"container.googleapis.com",
"containerregistry.googleapis.com",
"artifactregistry.googleapis.com",
"dataflow.googleapis.com",
"orgpolicy.googleapis.com",
"pubsub.googleapis.com",
@@ -148,3 +164,46 @@ module "orch-nat" {
region = var.region
router_network = module.orch-vpc.0.name
}

module "orch-artifact-reg" {
source = "../../../modules/artifact-registry"
project_id = module.orch-project.project_id
id = "${var.prefix}-app-images"
location = var.region
format = "DOCKER"
description = "Docker repository storing application images e.g. Dataflow, Cloud Run etc..."
}

module "orch-cs-df-template" {
source = "../../../modules/gcs"
project_id = module.orch-project.project_id
prefix = var.prefix
name = "orc-cs-df-template"
location = var.region
storage_class = "REGIONAL"
encryption_key = try(local.service_encryption_keys.storage, null)
}

module "orch-cs-build-staging" {
source = "../../../modules/gcs"
project_id = module.orch-project.project_id
prefix = var.prefix
name = "orc-cs-build-staging"
location = var.region
storage_class = "REGIONAL"
encryption_key = try(local.service_encryption_keys.storage, null)
}

module "orch-sa-df-build" {
source = "../../../modules/iam-service-account"
project_id = module.orch-project.project_id
prefix = var.prefix
name = "orc-sa-df-build"
display_name = "Data platform Dataflow build service account"
# Note: the values below should list the systems / groups / users that are
# allowed to invoke builds via this service account.
iam = {
"roles/iam.serviceAccountTokenCreator" = [local.groups_iam.data-engineers]
"roles/iam.serviceAccountUser" = [local.groups_iam.data-engineers]
}
}
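
The token-creator and user grants above let data engineers impersonate this build service account instead of handling long-lived keys. A minimal sketch with the google-auth library, where the account email, lifetime, and scope are illustrative assumptions:

```python
# Illustrative sketch: service account email and scopes are placeholders.
import google.auth
from google.auth import impersonated_credentials

# Credentials of the caller, e.g. a data engineer's local ADC login.
source_credentials, _ = google.auth.default()

# Short-lived impersonated credentials for the Dataflow build SA,
# made possible by roles/iam.serviceAccountTokenCreator above.
target_credentials = impersonated_credentials.Credentials(
    source_credentials=source_credentials,
    target_principal="myprefix-orc-sa-df-build@my-orch-project.iam.gserviceaccount.com",
    target_scopes=["https://www.googleapis.com/auth/cloud-platform"],
    lifetime=300,  # seconds
)

# target_credentials can now be passed to any google-cloud client, e.g. the
# Cloud Build client that submits the Flex Template image build.
```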
6 changes: 4 additions & 2 deletions blueprints/data-solutions/data-platform-foundations/IAM.md
@@ -71,11 +71,13 @@ Legend: <code>+</code> additive, <code>•</code> conditional.

| members | roles |
|---|---|
|<b>gcp-data-engineers</b><br><small><i>group</i></small>|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor) <br>[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser) <br>[roles/cloudbuild.builds.editor](https://cloud.google.com/iam/docs/understanding-roles#cloudbuild.builds.editor) <br>[roles/composer.admin](https://cloud.google.com/iam/docs/understanding-roles#composer.admin) <br>[roles/composer.environmentAndStorageObjectAdmin](https://cloud.google.com/iam/docs/understanding-roles#composer.environmentAndStorageObjectAdmin) <br>[roles/iam.serviceAccountUser](https://cloud.google.com/iam/docs/understanding-roles#iam.serviceAccountUser) <br>[roles/iap.httpsResourceAccessor](https://cloud.google.com/iam/docs/understanding-roles#iap.httpsResourceAccessor) <br>[roles/storage.admin](https://cloud.google.com/iam/docs/understanding-roles#storage.admin) <br>[roles/storage.objectAdmin](https://cloud.google.com/iam/docs/understanding-roles#storage.objectAdmin) |
|<b>gcp-data-engineers</b><br><small><i>group</i></small>|[roles/artifactregistry.admin](https://cloud.google.com/iam/docs/understanding-roles#artifactregistry.admin) <br>[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor) <br>[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser) <br>[roles/cloudbuild.builds.editor](https://cloud.google.com/iam/docs/understanding-roles#cloudbuild.builds.editor) <br>[roles/composer.admin](https://cloud.google.com/iam/docs/understanding-roles#composer.admin) <br>[roles/composer.environmentAndStorageObjectAdmin](https://cloud.google.com/iam/docs/understanding-roles#composer.environmentAndStorageObjectAdmin) <br>[roles/iam.serviceAccountUser](https://cloud.google.com/iam/docs/understanding-roles#iam.serviceAccountUser) <br>[roles/iap.httpsResourceAccessor](https://cloud.google.com/iam/docs/understanding-roles#iap.httpsResourceAccessor) <br>[roles/serviceusage.serviceUsageConsumer](https://cloud.google.com/iam/docs/understanding-roles#serviceusage.serviceUsageConsumer) <br>[roles/storage.admin](https://cloud.google.com/iam/docs/understanding-roles#storage.admin) <br>[roles/storage.objectAdmin](https://cloud.google.com/iam/docs/understanding-roles#storage.objectAdmin) |
|<b>SERVICE_IDENTITY_cloudcomposer-accounts</b><br><small><i>serviceAccount</i></small>|[roles/composer.ServiceAgentV2Ext](https://cloud.google.com/iam/docs/understanding-roles#composer.ServiceAgentV2Ext) <br>[roles/storage.objectAdmin](https://cloud.google.com/iam/docs/understanding-roles#storage.objectAdmin) |
|<b>SERVICE_IDENTITY_gcp-sa-cloudbuild</b><br><small><i>serviceAccount</i></small>|[roles/storage.objectAdmin](https://cloud.google.com/iam/docs/understanding-roles#storage.objectAdmin) |
|<b>SERVICE_IDENTITY_service-networking</b><br><small><i>serviceAccount</i></small>|[roles/servicenetworking.serviceAgent](https://cloud.google.com/iam/docs/understanding-roles#servicenetworking.serviceAgent) <code>+</code>|
|<b>load-df-0</b><br><small><i>serviceAccount</i></small>|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor) <br>[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) |
|<b>load-df-0</b><br><small><i>serviceAccount</i></small>|[roles/artifactregistry.reader](https://cloud.google.com/iam/docs/understanding-roles#artifactregistry.reader) <br>[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor) <br>[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) |
|<b>orc-cmp-0</b><br><small><i>serviceAccount</i></small>|[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser) <br>[roles/composer.worker](https://cloud.google.com/iam/docs/understanding-roles#composer.worker) <br>[roles/iam.serviceAccountUser](https://cloud.google.com/iam/docs/understanding-roles#iam.serviceAccountUser) <br>[roles/storage.objectAdmin](https://cloud.google.com/iam/docs/understanding-roles#storage.objectAdmin) |
|<b>orc-sa-df-build</b><br><small><i>serviceAccount</i></small>|[roles/cloudbuild.serviceAgent](https://cloud.google.com/iam/docs/understanding-roles#cloudbuild.serviceAgent) <br>[roles/storage.objectAdmin](https://cloud.google.com/iam/docs/understanding-roles#storage.objectAdmin) |
|<b>trf-df-0</b><br><small><i>serviceAccount</i></small>|[roles/bigquery.dataEditor](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataEditor) |

## Project <i>trf</i>
17 changes: 9 additions & 8 deletions blueprints/data-solutions/data-platform-foundations/README.md
@@ -219,7 +219,7 @@ module "data-platform" {
prefix = "myprefix"
}
# tftest modules=39 resources=287
# tftest modules=43 resources=297
```

## Customizations
@@ -263,13 +263,14 @@ You can find examples in the [demo](./demo) folder.

| name | description | sensitive |
|---|---|:---:|
| [bigquery-datasets](outputs.tf#L17) | BigQuery datasets. | |
| [demo_commands](outputs.tf#L27) | Demo commands. Relevant only if Composer is deployed. | |
| [gcs-buckets](outputs.tf#L40) | GCS buckets. | |
| [kms_keys](outputs.tf#L53) | Cloud KMS keys. | |
| [projects](outputs.tf#L58) | GCP projects information. | |
| [vpc_network](outputs.tf#L84) | VPC network. | |
| [vpc_subnet](outputs.tf#L93) | VPC subnetworks. | |
| [bigquery-datasets](outputs.tf#L16) | BigQuery datasets. | |
| [demo_commands](outputs.tf#L26) | Demo commands. Relevant only if Composer is deployed. | |
| [df_template](outputs.tf#L49) | Dataflow template image and template details. | |
| [gcs-buckets](outputs.tf#L58) | GCS buckets. | |
| [kms_keys](outputs.tf#L71) | Cloud KMS keys. | |
| [projects](outputs.tf#L76) | GCP projects information. | |
| [vpc_network](outputs.tf#L102) | VPC network. | |
| [vpc_subnet](outputs.tf#L111) | VPC subnetworks. | |

<!-- END TFDOC -->
## TODOs
@@ -23,10 +23,11 @@ Below you can find a description of each example:
## Running the demo
To run the demo examples, follow these steps:

- 01: copy sample data to the `drop off` Cloud Storage bucket impersonating the `load` service account.
- 02: copy sample data structure definition in the `orchestration` Cloud Storage bucket impersonating the `orchestration` service account.
- 03: copy the Cloud Composer DAG to the Cloud Composer Storage bucket impersonating the `orchestration` service account.
- 04: Open the Cloud Composer Airflow UI and run the imported DAG.
- 05: Run the BigQuery query to see results.
- 01: Copy sample data to the `drop off` Cloud Storage bucket impersonating the `load` service account.
- 02: Copy the sample data structure definition to the `orchestration` Cloud Storage bucket impersonating the `orchestration` service account.
- 03: Copy the Cloud Composer DAG to the Cloud Composer Storage bucket impersonating the `orchestration` service account.
- 04: Build the Dataflow Flex template and image via a Cloud Build pipeline.
- 05: Open the Cloud Composer Airflow UI and run the imported DAG.
- 06: Run the BigQuery query to see results.

You can find pre-computed commands in the `demo_commands` output variable of the deployed Terraform [data pipeline](../).
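
For the final step, the loaded results can also be inspected programmatically; a hedged sketch with the BigQuery Python client, where the project, dataset, and table names are placeholders:

```python
# Illustrative sketch: project, dataset, and table names are placeholders.
from google.cloud import bigquery

client = bigquery.Client(project="my-dwh-project")

query = """
    SELECT *
    FROM `my-dwh-project.my_dataset.customers`
    LIMIT 10
"""

# Run the query and print each result row as a dict.
for row in client.query(query).result():
    print(dict(row))
```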
@@ -0,0 +1,160 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
@@ -0,0 +1,29 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

FROM gcr.io/dataflow-templates-base/python39-template-launcher-base

ENV FLEX_TEMPLATE_PYTHON_REQUIREMENTS_FILE="/template/requirements.txt"
ENV FLEX_TEMPLATE_PYTHON_PY_FILE="/template/csv2bq.py"

COPY ./src/ /template

RUN apt-get update \
&& apt-get install -y libffi-dev git \
&& rm -rf /var/lib/apt/lists/* \
&& pip install --no-cache-dir --upgrade pip \
&& pip install --no-cache-dir -r $FLEX_TEMPLATE_PYTHON_REQUIREMENTS_FILE \
&& pip download --no-cache-dir --dest /tmp/dataflow-requirements-cache -r $FLEX_TEMPLATE_PYTHON_REQUIREMENTS_FILE

ENV PIP_NO_DEPS=True
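
The Dockerfile above wires `csv2bq.py` in as the template's entry point, but the script itself is not shown in this view. The following is only a minimal sketch of what a CSV-to-BigQuery Beam pipeline of that shape could look like; column names, parameters, and options are assumptions, not the commit's actual code.

```python
# Minimal illustrative sketch of a CSV-to-BigQuery pipeline; the actual
# csv2bq.py in this commit may differ. Column names are placeholders.
import argparse

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions


def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument("--csv_file", required=True)      # gs://... input file
    parser.add_argument("--output_table", required=True)  # project:dataset.table
    known_args, pipeline_args = parser.parse_known_args(argv)

    options = PipelineOptions(pipeline_args, save_main_session=True)

    with beam.Pipeline(options=options) as p:
        (
            p
            | "ReadCsv" >> beam.io.ReadFromText(
                known_args.csv_file, skip_header_lines=1)
            | "ParseLine" >> beam.Map(
                lambda line: dict(zip(("id", "name", "surname"),
                                      line.split(","))))
            | "WriteToBq" >> beam.io.WriteToBigQuery(
                known_args.output_table,
                schema="id:STRING,name:STRING,surname:STRING",
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)
        )


if __name__ == "__main__":
    run()
```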