From 40656a23de95f0d1f0f29afa8ff889a896971efd Mon Sep 17 00:00:00 2001 From: lcaggio Date: Tue, 16 May 2023 14:08:03 +0200 Subject: [PATCH] Minimal Data Platform - Make components optional (#1380) make some components optional: Composer and Data Proc history server. --- .../data-platform-minimal/01-landing.tf | 2 +- .../data-platform-minimal/02-composer.tf | 6 +-- .../data-platform-minimal/02-dataproc.tf | 28 +++++++------ .../data-platform-minimal/02-processing.tf | 2 +- .../data-platform-minimal/03-curated.tf | 6 +-- .../data-platform-minimal/04-common.tf | 6 +-- .../data-platform-minimal/README.md | 41 ++++++++++--------- .../demo/orchestrate_pyspark.py | 10 ++--- .../data-platform-minimal/outputs.tf | 17 +++----- .../data-platform-minimal/variables.tf | 16 ++++++-- 10 files changed, 70 insertions(+), 64 deletions(-) diff --git a/blueprints/data-solutions/data-platform-minimal/01-landing.tf b/blueprints/data-solutions/data-platform-minimal/01-landing.tf index c6edddd070..48eb9969c0 100644 --- a/blueprints/data-solutions/data-platform-minimal/01-landing.tf +++ b/blueprints/data-solutions/data-platform-minimal/01-landing.tf @@ -18,7 +18,7 @@ locals { iam_lnd = { "roles/storage.objectCreator" = [module.land-sa-cs-0.iam_email] "roles/storage.objectViewer" = [module.processing-sa-cmp-0.iam_email] - "roles/storage.objectAdmin" = [module.processing-sa-dp-0.iam_email] + "roles/storage.objectAdmin" = [module.processing-sa-0.iam_email] } } diff --git a/blueprints/data-solutions/data-platform-minimal/02-composer.tf b/blueprints/data-solutions/data-platform-minimal/02-composer.tf index 616d80ad00..de7d1738bc 100644 --- a/blueprints/data-solutions/data-platform-minimal/02-composer.tf +++ b/blueprints/data-solutions/data-platform-minimal/02-composer.tf @@ -25,10 +25,10 @@ locals { GCP_REGION = var.region LAND_PRJ = module.land-project.project_id LAND_GCS = module.land-cs-0.name - PHS_CLUSTER_NAME = module.processing-dp-historyserver.name + PHS_CLUSTER_NAME = try(module.processing-dp-historyserver[0].name, null) PROCESSING_GCS = module.processing-cs-0.name PROCESSING_PRJ = module.processing-project.project_id - PROCESSING_SA_DP = module.processing-sa-dp-0.email + PROCESSING_SA = module.processing-sa-0.email PROCESSING_SUBNET = local.processing_subnet PROCESSING_VPC = local.processing_vpc } @@ -47,7 +47,7 @@ module "processing-sa-cmp-0" { } resource "google_composer_environment" "processing-cmp-0" { - count = var.composer_config.disable_deployment == true ? 0 : 1 + count = var.enable_services.composer == true ? 1 : 0 project = module.processing-project.project_id name = "${var.prefix}-prc-cmp-0" region = var.region diff --git a/blueprints/data-solutions/data-platform-minimal/02-dataproc.tf b/blueprints/data-solutions/data-platform-minimal/02-dataproc.tf index 1161abf018..4275c559a4 100644 --- a/blueprints/data-solutions/data-platform-minimal/02-dataproc.tf +++ b/blueprints/data-solutions/data-platform-minimal/02-dataproc.tf @@ -14,7 +14,8 @@ # tfdoc:file:description Cloud Dataproc resources. -module "processing-cs-dp-history" { +module "processing-dp-history" { + count = var.enable_services.dataproc_history_server == true ? 1 : 0 source = "../../../modules/gcs" project_id = module.processing-project.project_id prefix = var.prefix @@ -24,12 +25,12 @@ module "processing-cs-dp-history" { encryption_key = var.service_encryption_keys.storage } -module "processing-sa-dp-0" { +module "processing-sa-0" { source = "../../../modules/iam-service-account" project_id = module.processing-project.project_id prefix = var.prefix - name = "prc-dp-0" - display_name = "Dataproc service account" + name = "prc-0" + display_name = "Processing service account" iam = { "roles/iam.serviceAccountTokenCreator" = [ local.groups_iam.data-engineers, @@ -41,7 +42,7 @@ module "processing-sa-dp-0" { } } -module "processing-dp-staging-0" { +module "processing-staging-0" { source = "../../../modules/gcs" project_id = module.processing-project.project_id prefix = var.prefix @@ -51,7 +52,7 @@ module "processing-dp-staging-0" { encryption_key = var.service_encryption_keys.storage } -module "processing-dp-temp-0" { +module "processing-temp-0" { source = "../../../modules/gcs" project_id = module.processing-project.project_id prefix = var.prefix @@ -61,7 +62,7 @@ module "processing-dp-temp-0" { encryption_key = var.service_encryption_keys.storage } -module "processing-dp-log-0" { +module "processing-log-0" { source = "../../../modules/gcs" project_id = module.processing-project.project_id prefix = var.prefix @@ -72,19 +73,20 @@ module "processing-dp-log-0" { } module "processing-dp-historyserver" { + count = var.enable_services.dataproc_history_server == true ? 1 : 0 source = "../../../modules/dataproc" project_id = module.processing-project.project_id - name = "hystory-server" + name = "history-server" prefix = var.prefix region = var.region dataproc_config = { cluster_config = { - staging_bucket = module.processing-dp-staging-0.name - temp_bucket = module.processing-dp-temp-0.name + staging_bucket = module.processing-staging-0.name + temp_bucket = module.processing-temp-0.name gce_cluster_config = { subnetwork = module.processing-vpc[0].subnets["${var.region}/${var.prefix}-processing"].self_link zone = "${var.region}-b" - service_account = module.processing-sa-dp-0.email + service_account = module.processing-sa-0.email service_account_scopes = ["cloud-platform"] internal_ip_only = true } @@ -99,10 +101,10 @@ module "processing-dp-historyserver" { "dataproc:dataproc.allow.zero.workers" = "true" "dataproc:job.history.to-gcs.enabled" = "true" "spark:spark.history.fs.logDirectory" = ( - "gs://${module.processing-dp-staging-0.name}/*/spark-job-history" + "gs://${module.processing-staging-0.name}/*/spark-job-history" ) "spark:spark.eventLog.dir" = ( - "gs://${module.processing-dp-staging-0.name}/*/spark-job-history" + "gs://${module.processing-staging-0.name}/*/spark-job-history" ) "spark:spark.history.custom.executor.log.url.applyIncompleteApplication" = "false" "spark:spark.history.custom.executor.log.url" = ( diff --git a/blueprints/data-solutions/data-platform-minimal/02-processing.tf b/blueprints/data-solutions/data-platform-minimal/02-processing.tf index 9bbb623430..6dbd13331b 100644 --- a/blueprints/data-solutions/data-platform-minimal/02-processing.tf +++ b/blueprints/data-solutions/data-platform-minimal/02-processing.tf @@ -28,7 +28,7 @@ locals { module.processing-sa-cmp-0.iam_email ] "roles/dataproc.worker" = [ - module.processing-sa-dp-0.iam_email + module.processing-sa-0.iam_email ] "roles/iam.serviceAccountUser" = [ module.processing-sa-cmp-0.iam_email, local.groups_iam.data-engineers diff --git a/blueprints/data-solutions/data-platform-minimal/03-curated.tf b/blueprints/data-solutions/data-platform-minimal/03-curated.tf index 5b044b51ef..730e8d6cb7 100644 --- a/blueprints/data-solutions/data-platform-minimal/03-curated.tf +++ b/blueprints/data-solutions/data-platform-minimal/03-curated.tf @@ -16,13 +16,13 @@ locals { cur_iam = { - "roles/bigquery.dataOwner" = [module.processing-sa-dp-0.iam_email] + "roles/bigquery.dataOwner" = [module.processing-sa-0.iam_email] "roles/bigquery.dataViewer" = [ local.groups_iam.data-analysts, local.groups_iam.data-engineers ] "roles/bigquery.jobUser" = [ - module.processing-sa-dp-0.iam_email, + module.processing-sa-0.iam_email, local.groups_iam.data-analysts, local.groups_iam.data-engineers ] @@ -35,7 +35,7 @@ locals { "roles/storage.objectViewer" = [ local.groups_iam.data-analysts, local.groups_iam.data-engineers ] - "roles/storage.objectAdmin" = [module.processing-sa-dp-0.iam_email] + "roles/storage.objectAdmin" = [module.processing-sa-0.iam_email] } cur_services = [ "iam.googleapis.com", diff --git a/blueprints/data-solutions/data-platform-minimal/04-common.tf b/blueprints/data-solutions/data-platform-minimal/04-common.tf index 3a2d01bdf0..52f6e84f07 100644 --- a/blueprints/data-solutions/data-platform-minimal/04-common.tf +++ b/blueprints/data-solutions/data-platform-minimal/04-common.tf @@ -20,16 +20,16 @@ locals { "roles/dlp.estimatesAdmin" = [local.groups_iam.data-engineers] "roles/dlp.reader" = [local.groups_iam.data-engineers] "roles/dlp.user" = [ - module.processing-sa-dp-0.iam_email, + module.processing-sa-0.iam_email, local.groups_iam.data-engineers ] "roles/datacatalog.admin" = [local.groups_iam.data-security] "roles/datacatalog.viewer" = [ - module.processing-sa-dp-0.iam_email, + module.processing-sa-0.iam_email, local.groups_iam.data-analysts ] "roles/datacatalog.categoryFineGrainedReader" = [ - module.processing-sa-dp-0.iam_email + module.processing-sa-0.iam_email ] } } diff --git a/blueprints/data-solutions/data-platform-minimal/README.md b/blueprints/data-solutions/data-platform-minimal/README.md index a1d60f1c60..f468911310 100644 --- a/blueprints/data-solutions/data-platform-minimal/README.md +++ b/blueprints/data-solutions/data-platform-minimal/README.md @@ -230,8 +230,8 @@ network_config = { host_project = "PROJECT_ID" network_self_link = "https://www.googleapis.com/compute/v1/projects/PROJECT_ID/global/networks/NAME" subnet_self_links = { - processing_dataproc = "https://www.googleapis.com/compute/v1/projects/PROJECT_ID/regions/REGION/subnetworks/NAME" - processing_composer = "https://www.googleapis.com/compute/v1/projects/PROJECT_ID/regions/REGION/subnetworks/NAME" + processing_transformation = "https://www.googleapis.com/compute/v1/projects/PROJECT_ID/regions/REGION/subnetworks/NAME" + processing_composer = "https://www.googleapis.com/compute/v1/projects/PROJECT_ID/regions/REGION/subnetworks/NAME" } composer_ip_ranges = { cloudsql = "192.168.XXX.XXX/24" @@ -280,29 +280,30 @@ The application layer is out of scope of this script. As a demo purpuse only, on | name | description | type | required | default | |---|---|:---:|:---:|:---:| -| [organization_domain](variables.tf#L114) | Organization domain. | string | ✓ | | -| [prefix](variables.tf#L119) | Prefix used for resource names. | string | ✓ | | -| [project_config](variables.tf#L128) | Provide 'billing_account_id' value if project creation is needed, uses existing 'project_ids' if null. Parent is in 'folders/nnn' or 'organizations/nnn' format. | object({…}) | ✓ | | -| [composer_config](variables.tf#L17) | Cloud Composer config. | object({…}) | | {} | -| [data_catalog_tags](variables.tf#L55) | List of Data Catalog Policy tags to be created with optional IAM binging configuration in {tag => {ROLE => [MEMBERS]}} format. | map(map(list(string))) | | {…} | -| [data_force_destroy](variables.tf#L66) | Flag to set 'force_destroy' on data services like BiguQery or Cloud Storage. | bool | | false | -| [groups](variables.tf#L72) | User groups. | map(string) | | {…} | -| [location](variables.tf#L82) | Location used for multi-regional resources. | string | | "eu" | -| [network_config](variables.tf#L88) | Shared VPC network configurations to use. If null networks will be created in projects. | object({…}) | | {} | -| [project_suffix](variables.tf#L152) | Suffix used only for project ids. | string | | null | -| [region](variables.tf#L158) | Region used for regional resources. | string | | "europe-west1" | -| [service_encryption_keys](variables.tf#L164) | Cloud KMS to use to encrypt different services. Key location should match service region. | object({…}) | | {} | +| [organization_domain](variables.tf#L122) | Organization domain. | string | ✓ | | +| [prefix](variables.tf#L127) | Prefix used for resource names. | string | ✓ | | +| [project_config](variables.tf#L136) | Provide 'billing_account_id' value if project creation is needed, uses existing 'project_ids' if null. Parent is in 'folders/nnn' or 'organizations/nnn' format. | object({…}) | ✓ | | +| [composer_config](variables.tf#L17) | Cloud Composer config. | object({…}) | | {} | +| [data_catalog_tags](variables.tf#L54) | List of Data Catalog Policy tags to be created with optional IAM binging configuration in {tag => {ROLE => [MEMBERS]}} format. | map(map(list(string))) | | {…} | +| [data_force_destroy](variables.tf#L65) | Flag to set 'force_destroy' on data services like BiguQery or Cloud Storage. | bool | | false | +| [enable_services](variables.tf#L71) | Flag to enable or disable services in the Data Platform. | object({…}) | | {} | +| [groups](variables.tf#L80) | User groups. | map(string) | | {…} | +| [location](variables.tf#L90) | Location used for multi-regional resources. | string | | "eu" | +| [network_config](variables.tf#L96) | Shared VPC network configurations to use. If null networks will be created in projects. | object({…}) | | {} | +| [project_suffix](variables.tf#L160) | Suffix used only for project ids. | string | | null | +| [region](variables.tf#L166) | Region used for regional resources. | string | | "europe-west1" | +| [service_encryption_keys](variables.tf#L172) | Cloud KMS to use to encrypt different services. Key location should match service region. | object({…}) | | {} | ## Outputs | name | description | sensitive | |---|---|:---:| | [bigquery-datasets](outputs.tf#L17) | BigQuery datasets. | | -| [dataproc-hystory-server](outputs.tf#L24) | List of bucket names which have been assigned to the cluster. | | -| [gcs-buckets](outputs.tf#L34) | GCS buckets. | ✓ | -| [kms_keys](outputs.tf#L44) | Cloud MKS keys. | | -| [projects](outputs.tf#L49) | GCP Projects informations. | | -| [vpc_network](outputs.tf#L67) | VPC network. | | -| [vpc_subnet](outputs.tf#L75) | VPC subnetworks. | | +| [dataproc-history-server](outputs.tf#L24) | List of bucket names which have been assigned to the cluster. | | +| [gcs-buckets](outputs.tf#L29) | GCS buckets. | ✓ | +| [kms_keys](outputs.tf#L39) | Cloud MKS keys. | | +| [projects](outputs.tf#L44) | GCP Projects informations. | | +| [vpc_network](outputs.tf#L62) | VPC network. | | +| [vpc_subnet](outputs.tf#L70) | VPC subnetworks. | | diff --git a/blueprints/data-solutions/data-platform-minimal/demo/orchestrate_pyspark.py b/blueprints/data-solutions/data-platform-minimal/demo/orchestrate_pyspark.py index ef5084ffe2..295fdd62fc 100644 --- a/blueprints/data-solutions/data-platform-minimal/demo/orchestrate_pyspark.py +++ b/blueprints/data-solutions/data-platform-minimal/demo/orchestrate_pyspark.py @@ -41,9 +41,9 @@ PHS_CLUSTER_NAME = os.environ.get("PHS_CLUSTER_NAME") PROCESSING_GCS = os.environ.get("PROCESSING_GCS") PROCESSING_PRJ = os.environ.get("PROCESSING_PRJ") -PROCESSING_SA_DP = os.environ.get("PROCESSING_SA_DP") -PROCESSING_SA_SUBNET = os.environ.get("PROCESSING_SUBNET") -PROCESSING_SA_VPC = os.environ.get("PROCESSING_VPC") +PROCESSING_SA = os.environ.get("PROCESSING_SA") +PROCESSING_SUBNET = os.environ.get("PROCESSING_SUBNET") +PROCESSING_VPC = os.environ.get("PROCESSING_VPC") PYTHON_FILE_LOCATION = "gs://"+PROCESSING_GCS+"/pyspark_sort.py" PHS_CLUSTER_PATH = "projects/"+PROCESSING_PRJ+"/regions/"+DP_REGION+"/clusters/"+PHS_CLUSTER_NAME @@ -65,8 +65,8 @@ batch={ "environment_config": { "execution_config": { - "service_account": PROCESSING_SA_DP, - "subnetwork_uri": PROCESSING_SA_SUBNET + "service_account": PROCESSING_SA, + "subnetwork_uri": PROCESSING_SUBNET }, "peripherals_config": { "spark_history_server_config":{ diff --git a/blueprints/data-solutions/data-platform-minimal/outputs.tf b/blueprints/data-solutions/data-platform-minimal/outputs.tf index 97eda2a3f5..22e641a0a1 100644 --- a/blueprints/data-solutions/data-platform-minimal/outputs.tf +++ b/blueprints/data-solutions/data-platform-minimal/outputs.tf @@ -21,14 +21,9 @@ output "bigquery-datasets" { } } -output "dataproc-hystory-server" { +output "dataproc-history-server" { description = "List of bucket names which have been assigned to the cluster." - value = { - bucket_names = module.processing-dp-historyserver.bucket_names - http_ports = module.processing-dp-historyserver.http_ports - instance_names = module.processing-dp-historyserver.instance_names - name = module.processing-dp-historyserver.name - } + value = one(module.processing-dp-historyserver) } output "gcs-buckets" { @@ -67,15 +62,15 @@ output "projects" { output "vpc_network" { description = "VPC network." value = { - processing_dataproc = local.processing_vpc - processing_composer = local.processing_vpc + processing_transformation = local.processing_vpc + processing_composer = local.processing_vpc } } output "vpc_subnet" { description = "VPC subnetworks." value = { - processing_dataproc = local.processing_subnet - processing_composer = local.processing_subnet + processing_transformation = local.processing_subnet + processing_composer = local.processing_subnet } } diff --git a/blueprints/data-solutions/data-platform-minimal/variables.tf b/blueprints/data-solutions/data-platform-minimal/variables.tf index a63f07c38f..e6b62df6f3 100644 --- a/blueprints/data-solutions/data-platform-minimal/variables.tf +++ b/blueprints/data-solutions/data-platform-minimal/variables.tf @@ -17,8 +17,7 @@ variable "composer_config" { description = "Cloud Composer config." type = object({ - disable_deployment = optional(bool, false) - environment_size = optional(string, "ENVIRONMENT_SIZE_SMALL") + environment_size = optional(string, "ENVIRONMENT_SIZE_SMALL") software_config = optional(object({ airflow_config_overrides = optional(map(string), {}) pypi_packages = optional(map(string), {}) @@ -69,6 +68,15 @@ variable "data_force_destroy" { default = false } +variable "enable_services" { + description = "Flag to enable or disable services in the Data Platform." + type = object({ + composer = optional(bool, true) + dataproc_history_server = optional(bool, true) + }) + default = {} +} + variable "groups" { description = "User groups." type = map(string) @@ -91,8 +99,8 @@ variable "network_config" { host_project = optional(string) network_self_link = optional(string) subnet_self_links = optional(object({ - processing_dataproc = string - processing_composer = string + processing_transformation = string + processing_composer = string }), null) composer_ip_ranges = optional(object({ connection_subnetwork = optional(string)