Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add lineage on Minimal Data Platform blueprint #1679

Merged
merged 6 commits into from
Sep 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ module "land-project" {
"bigquerystorage.googleapis.com",
"cloudkms.googleapis.com",
"cloudresourcemanager.googleapis.com",
"datalineage.googleapis.com",
"iam.googleapis.com",
"serviceusage.googleapis.com",
"stackdriver.googleapis.com",
Expand Down
12 changes: 8 additions & 4 deletions blueprints/data-solutions/data-platform-minimal/02-composer.tf
Original file line number Diff line number Diff line change
Expand Up @@ -51,16 +51,20 @@ module "processing-sa-cmp-0" {
}

resource "google_composer_environment" "processing-cmp-0" {
count = var.enable_services.composer == true ? 1 : 0
project = module.processing-project.project_id
name = "${var.prefix}-prc-cmp-0"
region = var.region
count = var.enable_services.composer == true ? 1 : 0
provider = google-beta
project = module.processing-project.project_id
name = "${var.prefix}-prc-cmp-0"
region = var.region
config {
software_config {
airflow_config_overrides = var.composer_config.software_config.airflow_config_overrides
pypi_packages = var.composer_config.software_config.pypi_packages
env_variables = local.env_variables
image_version = var.composer_config.software_config.image_version
cloud_data_lineage_integration {
enabled = var.composer_config.software_config.cloud_data_lineage_integration
}
}
workloads_config {
scheduler {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ module "processing-project" {
"compute.googleapis.com",
"container.googleapis.com",
"dataflow.googleapis.com",
"datalineage.googleapis.com",
"dataproc.googleapis.com",
"iam.googleapis.com",
"servicenetworking.googleapis.com",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ locals {
"cloudkms.googleapis.com",
"cloudresourcemanager.googleapis.com",
"compute.googleapis.com",
"datalineage.googleapis.com",
"iam.googleapis.com",
"servicenetworking.googleapis.com",
"serviceusage.googleapis.com",
Expand Down
28 changes: 14 additions & 14 deletions blueprints/data-solutions/data-platform-minimal/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ module "data-platform" {
prefix = "myprefix"
}

# tftest modules=23 resources=135
# tftest modules=23 resources=138
```

## Customizations
Expand Down Expand Up @@ -302,19 +302,19 @@ The application layer is out of scope of this script. As a demo purpuse only, on

| name | description | type | required | default |
|---|---|:---:|:---:|:---:|
| [organization_domain](variables.tf#L122) | Organization domain. | <code>string</code> | ✓ | |
| [prefix](variables.tf#L127) | Prefix used for resource names. | <code>string</code> | ✓ | |
| [project_config](variables.tf#L136) | Provide 'billing_account_id' value if project creation is needed, uses existing 'project_ids' if null. Parent is in 'folders/nnn' or 'organizations/nnn' format. | <code title="object&#40;&#123;&#10; billing_account_id &#61; optional&#40;string, null&#41;&#10; parent &#61; string&#10; project_ids &#61; optional&#40;object&#40;&#123;&#10; landing &#61; string&#10; processing &#61; string&#10; curated &#61; string&#10; common &#61; string&#10; &#125;&#41;, &#123;&#10; landing &#61; &#34;lnd&#34;&#10; processing &#61; &#34;prc&#34;&#10; curated &#61; &#34;cur&#34;&#10; common &#61; &#34;cmn&#34;&#10; &#125;&#10; &#41;&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | ✓ | |
| [composer_config](variables.tf#L17) | Cloud Composer config. | <code title="object&#40;&#123;&#10; environment_size &#61; optional&#40;string, &#34;ENVIRONMENT_SIZE_SMALL&#34;&#41;&#10; software_config &#61; optional&#40;object&#40;&#123;&#10; airflow_config_overrides &#61; optional&#40;map&#40;string&#41;, &#123;&#125;&#41;&#10; pypi_packages &#61; optional&#40;map&#40;string&#41;, &#123;&#125;&#41;&#10; env_variables &#61; optional&#40;map&#40;string&#41;, &#123;&#125;&#41;&#10; image_version &#61; optional&#40;string, &#34;composer-2-airflow-2&#34;&#41;&#10; &#125;&#41;, &#123;&#125;&#41;&#10; web_server_access_control &#61; optional&#40;map&#40;string&#41;, &#123;&#125;&#41;&#10; workloads_config &#61; optional&#40;object&#40;&#123;&#10; scheduler &#61; optional&#40;object&#40;&#123;&#10; cpu &#61; optional&#40;number, 0.5&#41;&#10; memory_gb &#61; optional&#40;number, 1.875&#41;&#10; storage_gb &#61; optional&#40;number, 1&#41;&#10; count &#61; optional&#40;number, 1&#41;&#10; &#125;&#10; &#41;, &#123;&#125;&#41;&#10; web_server &#61; optional&#40;object&#40;&#123;&#10; cpu &#61; optional&#40;number, 0.5&#41;&#10; memory_gb &#61; optional&#40;number, 1.875&#41;&#10; storage_gb &#61; optional&#40;number, 1&#41;&#10; &#125;&#41;, &#123;&#125;&#41;&#10; worker &#61; optional&#40;object&#40;&#123;&#10; cpu &#61; optional&#40;number, 0.5&#41;&#10; memory_gb &#61; optional&#40;number, 1.875&#41;&#10; storage_gb &#61; optional&#40;number, 1&#41;&#10; min_count &#61; optional&#40;number, 1&#41;&#10; max_count &#61; optional&#40;number, 3&#41;&#10; &#125;&#10; &#41;, &#123;&#125;&#41;&#10; &#125;&#41;, &#123;&#125;&#41;&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | | <code>&#123;&#125;</code> |
| [data_catalog_tags](variables.tf#L55) | List of Data Catalog Policy tags to be created with optional IAM binging configuration in {tag => {ROLE => [MEMBERS]}} format. | <code title="map&#40;object&#40;&#123;&#10; description &#61; optional&#40;string&#41;&#10; iam &#61; optional&#40;map&#40;list&#40;string&#41;&#41;, &#123;&#125;&#41;&#10;&#125;&#41;&#41;">map&#40;object&#40;&#123;&#8230;&#125;&#41;&#41;</code> | | <code title="&#123;&#10; &#34;3_Confidential&#34; &#61; &#123;&#125;&#10; &#34;2_Private&#34; &#61; &#123;&#125;&#10; &#34;1_Sensitive&#34; &#61; &#123;&#125;&#10;&#125;">&#123;&#8230;&#125;</code> |
| [data_force_destroy](variables.tf#L69) | Flag to set 'force_destroy' on data services like BiguQery or Cloud Storage. | <code>bool</code> | | <code>false</code> |
| [enable_services](variables.tf#L75) | Flag to enable or disable services in the Data Platform. | <code title="object&#40;&#123;&#10; composer &#61; optional&#40;bool, true&#41;&#10; dataproc_history_server &#61; optional&#40;bool, true&#41;&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | | <code>&#123;&#125;</code> |
| [groups](variables.tf#L84) | User groups. | <code>map&#40;string&#41;</code> | | <code title="&#123;&#10; data-analysts &#61; &#34;gcp-data-analysts&#34;&#10; data-engineers &#61; &#34;gcp-data-engineers&#34;&#10; data-security &#61; &#34;gcp-data-security&#34;&#10;&#125;">&#123;&#8230;&#125;</code> |
| [location](variables.tf#L94) | Location used for multi-regional resources. | <code>string</code> | | <code>&#34;eu&#34;</code> |
| [network_config](variables.tf#L100) | Shared VPC network configurations to use. If null networks will be created in projects. | <code title="object&#40;&#123;&#10; host_project &#61; optional&#40;string&#41;&#10; network_self_link &#61; optional&#40;string&#41;&#10; subnet_self_link &#61; optional&#40;string&#41;&#10; composer_ip_ranges &#61; optional&#40;object&#40;&#123;&#10; connection_subnetwork &#61; optional&#40;string&#41;&#10; cloud_sql &#61; optional&#40;string, &#34;10.20.10.0&#47;24&#34;&#41;&#10; gke_master &#61; optional&#40;string, &#34;10.20.11.0&#47;28&#34;&#41;&#10; pods_range_name &#61; optional&#40;string, &#34;pods&#34;&#41;&#10; services_range_name &#61; optional&#40;string, &#34;services&#34;&#41;&#10; &#125;&#41;, &#123;&#125;&#41;&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | | <code>&#123;&#125;</code> |
| [project_suffix](variables.tf#L160) | Suffix used only for project ids. | <code>string</code> | | <code>null</code> |
| [region](variables.tf#L166) | Region used for regional resources. | <code>string</code> | | <code>&#34;europe-west1&#34;</code> |
| [service_encryption_keys](variables.tf#L172) | Cloud KMS to use to encrypt different services. Key location should match service region. | <code title="object&#40;&#123;&#10; bq &#61; optional&#40;string&#41;&#10; composer &#61; optional&#40;string&#41;&#10; compute &#61; optional&#40;string&#41;&#10; storage &#61; optional&#40;string&#41;&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | | <code>&#123;&#125;</code> |
| [organization_domain](variables.tf#L123) | Organization domain. | <code>string</code> | ✓ | |
| [prefix](variables.tf#L128) | Prefix used for resource names. | <code>string</code> | ✓ | |
| [project_config](variables.tf#L137) | Provide 'billing_account_id' value if project creation is needed, uses existing 'project_ids' if null. Parent is in 'folders/nnn' or 'organizations/nnn' format. | <code title="object&#40;&#123;&#10; billing_account_id &#61; optional&#40;string, null&#41;&#10; parent &#61; string&#10; project_ids &#61; optional&#40;object&#40;&#123;&#10; landing &#61; string&#10; processing &#61; string&#10; curated &#61; string&#10; common &#61; string&#10; &#125;&#41;, &#123;&#10; landing &#61; &#34;lnd&#34;&#10; processing &#61; &#34;prc&#34;&#10; curated &#61; &#34;cur&#34;&#10; common &#61; &#34;cmn&#34;&#10; &#125;&#10; &#41;&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | ✓ | |
| [composer_config](variables.tf#L17) | Cloud Composer config. | <code title="object&#40;&#123;&#10; environment_size &#61; optional&#40;string, &#34;ENVIRONMENT_SIZE_SMALL&#34;&#41;&#10; software_config &#61; optional&#40;object&#40;&#123;&#10; airflow_config_overrides &#61; optional&#40;map&#40;string&#41;, &#123;&#125;&#41;&#10; pypi_packages &#61; optional&#40;map&#40;string&#41;, &#123;&#125;&#41;&#10; env_variables &#61; optional&#40;map&#40;string&#41;, &#123;&#125;&#41;&#10; image_version &#61; optional&#40;string, &#34;composer-2-airflow-2&#34;&#41;&#10; cloud_data_lineage_integration &#61; optional&#40;bool, true&#41;&#10; &#125;&#41;, &#123;&#125;&#41;&#10; web_server_access_control &#61; optional&#40;map&#40;string&#41;, &#123;&#125;&#41;&#10; workloads_config &#61; optional&#40;object&#40;&#123;&#10; scheduler &#61; optional&#40;object&#40;&#123;&#10; cpu &#61; optional&#40;number, 0.5&#41;&#10; memory_gb &#61; optional&#40;number, 1.875&#41;&#10; storage_gb &#61; optional&#40;number, 1&#41;&#10; count &#61; optional&#40;number, 1&#41;&#10; &#125;&#10; &#41;, &#123;&#125;&#41;&#10; web_server &#61; optional&#40;object&#40;&#123;&#10; cpu &#61; optional&#40;number, 0.5&#41;&#10; memory_gb &#61; optional&#40;number, 1.875&#41;&#10; storage_gb &#61; optional&#40;number, 1&#41;&#10; &#125;&#41;, &#123;&#125;&#41;&#10; worker &#61; optional&#40;object&#40;&#123;&#10; cpu &#61; optional&#40;number, 0.5&#41;&#10; memory_gb &#61; optional&#40;number, 1.875&#41;&#10; storage_gb &#61; optional&#40;number, 1&#41;&#10; min_count &#61; optional&#40;number, 1&#41;&#10; max_count &#61; optional&#40;number, 3&#41;&#10; &#125;&#10; &#41;, &#123;&#125;&#41;&#10; &#125;&#41;, &#123;&#125;&#41;&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | | <code>&#123;&#125;</code> |
| [data_catalog_tags](variables.tf#L56) | List of Data Catalog Policy tags to be created with optional IAM binging configuration in {tag => {ROLE => [MEMBERS]}} format. | <code title="map&#40;object&#40;&#123;&#10; description &#61; optional&#40;string&#41;&#10; iam &#61; optional&#40;map&#40;list&#40;string&#41;&#41;, &#123;&#125;&#41;&#10;&#125;&#41;&#41;">map&#40;object&#40;&#123;&#8230;&#125;&#41;&#41;</code> | | <code title="&#123;&#10; &#34;3_Confidential&#34; &#61; &#123;&#125;&#10; &#34;2_Private&#34; &#61; &#123;&#125;&#10; &#34;1_Sensitive&#34; &#61; &#123;&#125;&#10;&#125;">&#123;&#8230;&#125;</code> |
| [data_force_destroy](variables.tf#L70) | Flag to set 'force_destroy' on data services like BiguQery or Cloud Storage. | <code>bool</code> | | <code>false</code> |
| [enable_services](variables.tf#L76) | Flag to enable or disable services in the Data Platform. | <code title="object&#40;&#123;&#10; composer &#61; optional&#40;bool, true&#41;&#10; dataproc_history_server &#61; optional&#40;bool, true&#41;&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | | <code>&#123;&#125;</code> |
| [groups](variables.tf#L85) | User groups. | <code>map&#40;string&#41;</code> | | <code title="&#123;&#10; data-analysts &#61; &#34;gcp-data-analysts&#34;&#10; data-engineers &#61; &#34;gcp-data-engineers&#34;&#10; data-security &#61; &#34;gcp-data-security&#34;&#10;&#125;">&#123;&#8230;&#125;</code> |
| [location](variables.tf#L95) | Location used for multi-regional resources. | <code>string</code> | | <code>&#34;eu&#34;</code> |
| [network_config](variables.tf#L101) | Shared VPC network configurations to use. If null networks will be created in projects. | <code title="object&#40;&#123;&#10; host_project &#61; optional&#40;string&#41;&#10; network_self_link &#61; optional&#40;string&#41;&#10; subnet_self_link &#61; optional&#40;string&#41;&#10; composer_ip_ranges &#61; optional&#40;object&#40;&#123;&#10; connection_subnetwork &#61; optional&#40;string&#41;&#10; cloud_sql &#61; optional&#40;string, &#34;10.20.10.0&#47;24&#34;&#41;&#10; gke_master &#61; optional&#40;string, &#34;10.20.11.0&#47;28&#34;&#41;&#10; pods_range_name &#61; optional&#40;string, &#34;pods&#34;&#41;&#10; services_range_name &#61; optional&#40;string, &#34;services&#34;&#41;&#10; &#125;&#41;, &#123;&#125;&#41;&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | | <code>&#123;&#125;</code> |
| [project_suffix](variables.tf#L161) | Suffix used only for project ids. | <code>string</code> | | <code>null</code> |
| [region](variables.tf#L167) | Region used for regional resources. | <code>string</code> | | <code>&#34;europe-west1&#34;</code> |
| [service_encryption_keys](variables.tf#L173) | Cloud KMS to use to encrypt different services. Key location should match service region. | <code title="object&#40;&#123;&#10; bq &#61; optional&#40;string&#41;&#10; composer &#61; optional&#40;string&#41;&#10; compute &#61; optional&#40;string&#41;&#10; storage &#61; optional&#40;string&#41;&#10;&#125;&#41;">object&#40;&#123;&#8230;&#125;&#41;</code> | | <code>&#123;&#125;</code> |

## Outputs

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,5 +54,5 @@ source ./env.sh
gsutil -i $LND_SA cp demo/data/*.csv gs://$LND_GCS
gsutil -i $CMP_SA cp demo/data/*.j* gs://$PRC_GCS
gsutil -i $CMP_SA cp demo/pyspark_* gs://$PRC_GCS
gsutil -i $CMP_SA cp demo/dag_*.py $CMP_GCS
gsutil -i $CMP_SA cp demo/dag_*.py gs://$CMP_GCS/dags
```
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@
schema_update_options=['ALLOW_FIELD_RELAXATION', 'ALLOW_FIELD_ADDITION'],
schema_object="customers.json",
schema_object_bucket=PROCESSING_GCS[5:],
project_id=PROCESSING_PRJ, # The process will continue to run on the dataset project until the Apache Airflow bug is fixed. https://github.com/apache/airflow/issues/32106
project_id=PROCESSING_PRJ,
impersonation_chain=[PROCESSING_SA]
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@
CURATED_PRJ = Variable.get("CURATED_PRJ")
DP_KMS_KEY = Variable.get("DP_KMS_KEY", "")
DP_REGION = Variable.get("DP_REGION")
GCP_REGION = Variable.get("GCP_REGION")
LAND_PRJ = Variable.get("LAND_PRJ")
LAND_BQ_DATASET = Variable.get("LAND_BQ_DATASET")
LAND_GCS = Variable.get("LAND_GCS")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@
CURATED_PRJ = Variable.get("CURATED_PRJ")
DP_KMS_KEY = Variable.get("DP_KMS_KEY", "")
DP_REGION = Variable.get("DP_REGION")
GCP_REGION = Variable.get("GCP_REGION")
LAND_PRJ = Variable.get("LAND_PRJ")
LAND_BQ_DATASET = Variable.get("LAND_BQ_DATASET")
LAND_GCS = Variable.get("LAND_GCS")
Expand Down
9 changes: 5 additions & 4 deletions blueprints/data-solutions/data-platform-minimal/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,11 @@ variable "composer_config" {
type = object({
environment_size = optional(string, "ENVIRONMENT_SIZE_SMALL")
software_config = optional(object({
airflow_config_overrides = optional(map(string), {})
pypi_packages = optional(map(string), {})
env_variables = optional(map(string), {})
image_version = optional(string, "composer-2-airflow-2")
airflow_config_overrides = optional(map(string), {})
pypi_packages = optional(map(string), {})
env_variables = optional(map(string), {})
image_version = optional(string, "composer-2-airflow-2")
cloud_data_lineage_integration = optional(bool, true)
}), {})
web_server_access_control = optional(map(string), {})
workloads_config = optional(object({
Expand Down