diff --git a/blueprints/README.md b/blueprints/README.md index 24610e1bba..b2636dbcbe 100644 --- a/blueprints/README.md +++ b/blueprints/README.md @@ -6,7 +6,7 @@ Currently available blueprints: - **apigee** - [Apigee Hybrid on GKE](./apigee/hybrid-gke/), [Apigee X analytics in BigQuery](./apigee/bigquery-analytics), [Apigee network patterns](./apigee/network-patterns/) - **cloud operations** - [Active Directory Federation Services](./cloud-operations/adfs), [Cloud Asset Inventory feeds for resource change tracking and remediation](./cloud-operations/asset-inventory-feed-remediation), [Fine-grained Cloud DNS IAM via Service Directory](./cloud-operations/dns-fine-grained-iam), [Cloud DNS & Shared VPC design](./cloud-operations/dns-shared-vpc), [Delegated Role Grants](./cloud-operations/iam-delegated-role-grants), [Networking Dashboard](./cloud-operations/network-dashboard), [Managing on-prem service account keys by uploading public keys](./cloud-operations/onprem-sa-key-management), [Compute Image builder with Hashicorp Packer](./cloud-operations/packer-image-builder), [Packer example](./cloud-operations/packer-image-builder/packer), [Compute Engine quota monitoring](./cloud-operations/quota-monitoring), [Scheduled Cloud Asset Inventory Export to Bigquery](./cloud-operations/scheduled-asset-inventory-export-bq), [Configuring workload identity federation with Terraform Cloud/Enterprise workflows](./cloud-operations/terraform-cloud-dynamic-credentials), [TCP healthcheck and restart for unmanaged GCE instances](./cloud-operations/unmanaged-instances-healthcheck), [Migrate for Compute Engine (v5) blueprints](./cloud-operations/vm-migration), [Configuring workload identity federation to access Google Cloud resources from apps running on Azure](./cloud-operations/workload-identity-federation) -- **data solutions** - [GCE and GCS CMEK via centralized Cloud KMS](./data-solutions/cmek-via-centralized-kms), [Cloud Composer version 2 private instance, supporting Shared VPC and 
external CMEK key](./data-solutions/composer-2), [Cloud SQL instance with multi-region read replicas](./data-solutions/cloudsql-multiregion), [Data Platform](./data-solutions/data-platform-foundations), [Spinning up a foundation data pipeline on Google Cloud using Cloud Storage, Dataflow and BigQuery](./data-solutions/gcs-to-bq-with-least-privileges), [#SQL Server Always On Groups blueprint](./data-solutions/sqlserver-alwayson), [Data Playground](./data-solutions/data-playground), [MLOps with Vertex AI](./data-solutions/vertex-mlops), [Shielded Folder](./data-solutions/shielded-folder), [BigQuery ML and Vertex AI Pipeline](./data-solutions/bq-ml) +- **data solutions** - [GCE and GCS CMEK via centralized Cloud KMS](./data-solutions/cmek-via-centralized-kms), [Cloud Composer version 2 private instance, supporting Shared VPC and external CMEK key](./data-solutions/composer-2), [Cloud SQL instance with multi-region read replicas](./data-solutions/cloudsql-multiregion), [Data Platform](./data-solutions/data-platform-foundations), [Minimal Data Platform](./data-solutions/data-platform-minimal), [Spinning up a foundation data pipeline on Google Cloud using Cloud Storage, Dataflow and BigQuery](./data-solutions/gcs-to-bq-with-least-privileges), [#SQL Server Always On Groups blueprint](./data-solutions/sqlserver-alwayson), [Data Playground](./data-solutions/data-playground), [MLOps with Vertex AI](./data-solutions/vertex-mlops), [Shielded Folder](./data-solutions/shielded-folder), [BigQuery ML and Vertex AI Pipeline](./data-solutions/bq-ml) - **factories** - [The why and the how of Resource Factories](./factories), [Google Cloud Identity Group Factory](./factories/cloud-identity-group-factory), [Google Cloud BQ Factory](./factories/bigquery-factory), [Google Cloud VPC Firewall Factory](./factories/net-vpc-firewall-yaml), [Minimal Project Factory](./factories/project-factory) - **GKE** - [Binary Authorization Pipeline Blueprint](./gke/binauthz), [Storage 
API](./gke/binauthz/image), [Multi-cluster mesh on GKE (fleet API)](./gke/multi-cluster-mesh-gke-fleet-api), [GKE Multitenant Blueprint](./gke/multitenant-fleet), [Shared VPC with GKE support](./networking/shared-vpc-gke/), [GKE Autopilot](./gke/autopilot) - **networking** - [Calling a private Cloud Function from On-premises](./networking/private-cloud-function-from-onprem), [Decentralized firewall management](./networking/decentralized-firewall), [Decentralized firewall validator](./networking/decentralized-firewall/validator), [Network filtering with Squid](./networking/filtering-proxy), [GLB and multi-regional daisy-chaining through hybrid NEGs](./networking/glb-hybrid-neg-internal), [Hybrid connectivity to on-premise services through PSC](./networking/psc-hybrid), [HTTP Load Balancer with Cloud Armor](./networking/glb-and-armor), [Hub and Spoke via VPN](./networking/hub-and-spoke-vpn), [Hub and Spoke via VPC Peering](./networking/hub-and-spoke-peering), [Internal Load Balancer as Next Hop](./networking/ilb-next-hop), [Network filtering with Squid with isolated VPCs using Private Service Connect](./networking/filtering-proxy-psc), On-prem DNS and Google Private Access, [PSC Producer](./networking/psc-hybrid/psc-producer), [PSC Consumer](./networking/psc-hybrid/psc-consumer), [Shared VPC with optional GKE cluster](./networking/shared-vpc-gke) diff --git a/blueprints/data-solutions/README.md b/blueprints/data-solutions/README.md index 9cef8bc267..3c5af881d7 100644 --- a/blueprints/data-solutions/README.md +++ b/blueprints/data-solutions/README.md @@ -29,8 +29,15 @@ This [blueprint](./composer-2/) creates a [Cloud Composer](https://cloud.google. ### Data Platform Foundations - -This [blueprint](./data-platform-foundations/) implements a robust and flexible Data Foundation on GCP that provides opinionated defaults, allowing customers to build and scale out additional data pipelines quickly and reliably. 
+ +This [blueprint](./data-platform-foundations/) implements a robust and flexible Data Platform on GCP that provides opinionated defaults, allowing customers to build and scale out additional data pipelines quickly and reliably. + +
+ +### Minimal Data Platform + + +This [blueprint](./data-platform-minimal/) implements a minimal Data Platform on GCP that provides opinionated defaults, allowing customers to build and scale out additional data pipelines quickly and reliably.
diff --git a/blueprints/data-solutions/data-platform-foundations/README.md b/blueprints/data-solutions/data-platform-foundations/README.md index 8b0e2344f8..e4ff871c09 100644 --- a/blueprints/data-solutions/data-platform-foundations/README.md +++ b/blueprints/data-solutions/data-platform-foundations/README.md @@ -2,6 +2,8 @@ This module implements an opinionated Data Platform Architecture that creates and setup projects and related resources that compose an end-to-end data environment. +For a minimal Data Platform, please refer to the [Minimal Data Platform](../data-platform-minimal/) blueprint. + The code is intentionally simple, as it's intended to provide a generic initial setup and then allow easy customizations to complete the implementation of the intended design. The following diagram is a high-level reference of the resources created and managed here: diff --git a/blueprints/data-solutions/data-platform-minimal/01-landing.tf b/blueprints/data-solutions/data-platform-minimal/01-landing.tf new file mode 100644 index 0000000000..c6edddd070 --- /dev/null +++ b/blueprints/data-solutions/data-platform-minimal/01-landing.tf @@ -0,0 +1,77 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# tfdoc:file:description Landing project and resources.
+ +locals { # IAM bindings applied to the landing project, keyed by role + iam_lnd = { + "roles/storage.objectCreator" = [module.land-sa-cs-0.iam_email] + "roles/storage.objectViewer" = [module.processing-sa-cmp-0.iam_email] + "roles/storage.objectAdmin" = [module.processing-sa-dp-0.iam_email] + } +} + +module "land-project" { + source = "../../../modules/project" + parent = var.project_config.parent + billing_account = var.project_config.billing_account_id + project_create = var.project_config.billing_account_id != null # create the project only when a billing account id is set + prefix = var.project_config.billing_account_id == null ? null : var.prefix + name = ( + var.project_config.billing_account_id == null + ? var.project_config.project_ids.landing + : "${var.project_config.project_ids.landing}${local.project_suffix}" + ) + iam = var.project_config.billing_account_id != null ? local.iam_lnd : null # bindings go through iam when the project is created here + iam_additive = var.project_config.billing_account_id == null ? local.iam_lnd : null # and through iam_additive when an existing project is reused + services = [ + "cloudkms.googleapis.com", + "cloudresourcemanager.googleapis.com", + "iam.googleapis.com", + "serviceusage.googleapis.com", + "stackdriver.googleapis.com", + "storage.googleapis.com", + "storage-component.googleapis.com", + ] + service_encryption_key_ids = { + bq = [var.service_encryption_keys.bq] + storage = [var.service_encryption_keys.storage] + } +} + +# Cloud Storage: landing bucket plus the service account granted objectCreator in iam_lnd + +module "land-sa-cs-0" { + source = "../../../modules/iam-service-account" + project_id = module.land-project.project_id + prefix = var.prefix + name = "lnd-cs-0" + display_name = "Data platform GCS landing service account."
+ iam = { + "roles/iam.serviceAccountTokenCreator" = [ + local.groups_iam.data-engineers + ] + } +} + +module "land-cs-0" { + source = "../../../modules/gcs" + project_id = module.land-project.project_id + prefix = var.prefix + name = "lnd-cs-0" + location = var.location + storage_class = "MULTI_REGIONAL" + encryption_key = var.service_encryption_keys.storage + force_destroy = var.data_force_destroy +} diff --git a/blueprints/data-solutions/data-platform-minimal/02-composer.tf b/blueprints/data-solutions/data-platform-minimal/02-composer.tf new file mode 100644 index 0000000000..616d80ad00 --- /dev/null +++ b/blueprints/data-solutions/data-platform-minimal/02-composer.tf @@ -0,0 +1,117 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# tfdoc:file:description Cloud Composer resources.
+ +locals { # environment variables merged into the Composer software_config below + env_variables = { + BQ_LOCATION = var.location + CURATED_BQ_DATASET = module.cur-bq-0.dataset_id + CURATED_GCS = module.cur-cs-0.url + CURATED_PRJ = module.cur-project.project_id + DP_KMS_KEY = var.service_encryption_keys.compute + DP_REGION = var.region + GCP_REGION = var.region + LAND_PRJ = module.land-project.project_id + LAND_GCS = module.land-cs-0.name + PHS_CLUSTER_NAME = module.processing-dp-historyserver.name + PROCESSING_GCS = module.processing-cs-0.name + PROCESSING_PRJ = module.processing-project.project_id + PROCESSING_SA_DP = module.processing-sa-dp-0.email + PROCESSING_SUBNET = local.processing_subnet + PROCESSING_VPC = local.processing_vpc + } +} + +module "processing-sa-cmp-0" { + source = "../../../modules/iam-service-account" + project_id = module.processing-project.project_id + prefix = var.prefix + name = "prc-cmp-0" + display_name = "Data platform Composer service account" + iam = { + "roles/iam.serviceAccountTokenCreator" = [local.groups_iam.data-engineers] + "roles/iam.serviceAccountUser" = [module.processing-sa-cmp-0.iam_email] # the Composer service account is listed as a member on itself + } +} + +resource "google_composer_environment" "processing-cmp-0" { + count = var.composer_config.disable_deployment == true ?
0 : 1 # no environment is created when disable_deployment is true + project = module.processing-project.project_id + name = "${var.prefix}-prc-cmp-0" + region = var.region + config { + software_config { + airflow_config_overrides = var.composer_config.software_config.airflow_config_overrides + pypi_packages = var.composer_config.software_config.pypi_packages + env_variables = merge( + var.composer_config.software_config.env_variables, local.env_variables + ) # local.env_variables is the later merge() argument, so it wins on key conflicts + image_version = var.composer_config.software_config.image_version + } + workloads_config { + scheduler { + cpu = var.composer_config.workloads_config.scheduler.cpu + memory_gb = var.composer_config.workloads_config.scheduler.memory_gb + storage_gb = var.composer_config.workloads_config.scheduler.storage_gb + count = var.composer_config.workloads_config.scheduler.count + } + web_server { + cpu = var.composer_config.workloads_config.web_server.cpu + memory_gb = var.composer_config.workloads_config.web_server.memory_gb + storage_gb = var.composer_config.workloads_config.web_server.storage_gb + } + worker { + cpu = var.composer_config.workloads_config.worker.cpu + memory_gb = var.composer_config.workloads_config.worker.memory_gb + storage_gb = var.composer_config.workloads_config.worker.storage_gb + min_count = var.composer_config.workloads_config.worker.min_count + max_count = var.composer_config.workloads_config.worker.max_count + } + } + + environment_size = var.composer_config.environment_size + + node_config { + network = local.processing_vpc + subnetwork = local.processing_subnet + service_account = module.processing-sa-cmp-0.email + enable_ip_masq_agent = true + tags = ["composer-worker"] + ip_allocation_policy { + cluster_secondary_range_name = var.network_config.composer_ip_ranges.pods_range_name + services_secondary_range_name = var.network_config.composer_ip_ranges.services_range_name + } + } + private_environment_config { + enable_private_endpoint = "true" + cloud_sql_ipv4_cidr_block = var.network_config.composer_ip_ranges.cloud_sql + master_ipv4_cidr_block =
var.network_config.composer_ip_ranges.gke_master + cloud_composer_connection_subnetwork = var.network_config.composer_ip_ranges.connection_subnetwork + } + dynamic "encryption_config" { # block is emitted only when a Composer encryption key is configured + for_each = ( + var.service_encryption_keys.composer != null + ? { 1 = 1 } + : {} + ) + content { + kms_key_name = var.service_encryption_keys.composer + } + } + } + depends_on = [ + module.processing-project + ] +} diff --git a/blueprints/data-solutions/data-platform-minimal/02-dataproc.tf b/blueprints/data-solutions/data-platform-minimal/02-dataproc.tf new file mode 100644 index 0000000000..1161abf018 --- /dev/null +++ b/blueprints/data-solutions/data-platform-minimal/02-dataproc.tf @@ -0,0 +1,121 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# tfdoc:file:description Cloud Dataproc resources.
+ +module "processing-cs-dp-history" { # NOTE(review): this bucket is not referenced by the history server below, which logs to the staging bucket - confirm intent + source = "../../../modules/gcs" + project_id = module.processing-project.project_id + prefix = var.prefix + name = "prc-cs-dp-history" + location = var.region + storage_class = "REGIONAL" + encryption_key = var.service_encryption_keys.storage +} + +module "processing-sa-dp-0" { + source = "../../../modules/iam-service-account" + project_id = module.processing-project.project_id + prefix = var.prefix + name = "prc-dp-0" + display_name = "Dataproc service account" + iam = { + "roles/iam.serviceAccountTokenCreator" = [ + local.groups_iam.data-engineers, + module.processing-sa-cmp-0.iam_email + ], + "roles/iam.serviceAccountUser" = [ + module.processing-sa-cmp-0.iam_email + ] + } +} + +module "processing-dp-staging-0" { + source = "../../../modules/gcs" + project_id = module.processing-project.project_id + prefix = var.prefix + name = "prc-stg-0" + location = var.location + storage_class = "MULTI_REGIONAL" + encryption_key = var.service_encryption_keys.storage +} + +module "processing-dp-temp-0" { + source = "../../../modules/gcs" + project_id = module.processing-project.project_id + prefix = var.prefix + name = "prc-tmp-0" + location = var.location + storage_class = "MULTI_REGIONAL" + encryption_key = var.service_encryption_keys.storage +} + +module "processing-dp-log-0" { + source = "../../../modules/gcs" + project_id = module.processing-project.project_id + prefix = var.prefix + name = "prc-log-0" + location = var.location + storage_class = "MULTI_REGIONAL" + encryption_key = var.service_encryption_keys.storage +} + +module "processing-dp-historyserver" { + source = "../../../modules/dataproc" + project_id = module.processing-project.project_id + name = "history-server" + prefix = var.prefix + region = var.region + dataproc_config = { + cluster_config = { + staging_bucket = module.processing-dp-staging-0.name + temp_bucket = module.processing-dp-temp-0.name + gce_cluster_config = { + subnetwork =
local.processing_subnet # shared-VPC subnet or the internally created one; module.processing-vpc has no instance [0] when a shared VPC is used + zone = "${var.region}-b" + service_account = module.processing-sa-dp-0.email + service_account_scopes = ["cloud-platform"] + internal_ip_only = true + } + worker_config = { + num_instances = 0 # no workers; paired with dataproc.allow.zero.workers below + machine_type = null + min_cpu_platform = null + image_uri = null + } + software_config = { + override_properties = { + "dataproc:dataproc.allow.zero.workers" = "true" + "dataproc:job.history.to-gcs.enabled" = "true" + "spark:spark.history.fs.logDirectory" = ( + "gs://${module.processing-dp-staging-0.name}/*/spark-job-history" + ) + "spark:spark.eventLog.dir" = ( + "gs://${module.processing-dp-staging-0.name}/*/spark-job-history" + ) + "spark:spark.history.custom.executor.log.url.applyIncompleteApplication" = "false" + "spark:spark.history.custom.executor.log.url" = ( + "{{YARN_LOG_SERVER_URL}}/{{NM_HOST}}:{{NM_PORT}}/{{CONTAINER_ID}}/{{CONTAINER_ID}}/{{USER}}/{{FILE_NAME}}" + ) + } + } + endpoint_config = { + enable_http_port_access = "true" + } + encryption_config = { + kms_key_name = var.service_encryption_keys.compute + } + } + } +} diff --git a/blueprints/data-solutions/data-platform-minimal/02-processing.tf b/blueprints/data-solutions/data-platform-minimal/02-processing.tf new file mode 100644 index 0000000000..9bbb623430 --- /dev/null +++ b/blueprints/data-solutions/data-platform-minimal/02-processing.tf @@ -0,0 +1,160 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + +# tfdoc:file:description Processing project and VPC. + +locals { + iam_processing = { + "roles/composer.admin" = [local.groups_iam.data-engineers] + "roles/composer.environmentAndStorageObjectAdmin" = [local.groups_iam.data-engineers] + "roles/composer.ServiceAgentV2Ext" = [ + "serviceAccount:${module.processing-project.service_accounts.robots.composer}" + ] + "roles/composer.worker" = [ + module.processing-sa-cmp-0.iam_email + ] + "roles/dataproc.editor" = [ + module.processing-sa-cmp-0.iam_email + ] + "roles/dataproc.worker" = [ + module.processing-sa-dp-0.iam_email + ] + "roles/iam.serviceAccountUser" = [ + module.processing-sa-cmp-0.iam_email, local.groups_iam.data-engineers + ] + "roles/iap.httpsResourceAccessor" = [local.groups_iam.data-engineers] + "roles/serviceusage.serviceUsageConsumer" = [local.groups_iam.data-engineers] + "roles/storage.admin" = [ + module.processing-sa-cmp-0.iam_email, + "serviceAccount:${module.processing-project.service_accounts.robots.composer}", + local.groups_iam.data-engineers + ] + } + processing_subnet = ( + local.use_shared_vpc + ? var.network_config.subnet_self_links.processingestration # NOTE(review): attribute name looks like a botched rename of "orchestration" - confirm against the variable declaration + : module.processing-vpc.0.subnet_self_links["${var.region}/${var.prefix}-processing"] + ) + processing_vpc = ( + local.use_shared_vpc + ? var.network_config.network_self_link + : module.processing-vpc.0.self_link + ) + + +} + +module "processing-project" { + source = "../../../modules/project" + parent = var.project_config.parent + billing_account = var.project_config.billing_account_id + project_create = var.project_config.billing_account_id != null # create the project only when a billing account id is set + prefix = var.project_config.billing_account_id == null ? null : var.prefix + name = ( + var.project_config.billing_account_id == null + ?
var.project_config.project_ids.processing + : "${var.project_config.project_ids.processing}${local.project_suffix}" + ) + iam = var.project_config.billing_account_id != null ? local.iam_processing : null + iam_additive = var.project_config.billing_account_id == null ? local.iam_processing : null + oslogin = false + services = [ + "bigquery.googleapis.com", + "bigqueryreservation.googleapis.com", + "bigquerystorage.googleapis.com", + "cloudkms.googleapis.com", + "cloudresourcemanager.googleapis.com", + "composer.googleapis.com", + "compute.googleapis.com", + "container.googleapis.com", + "dataproc.googleapis.com", + "iam.googleapis.com", + "servicenetworking.googleapis.com", + "serviceusage.googleapis.com", + "stackdriver.googleapis.com", + "storage.googleapis.com", + "storage-component.googleapis.com" + ] + service_encryption_key_ids = { + composer = [var.service_encryption_keys.composer] + compute = [var.service_encryption_keys.compute] + storage = [var.service_encryption_keys.storage] + } + shared_vpc_service_config = var.network_config.host_project == null ? null : { + attach = true + host_project = var.network_config.host_project + service_identity_iam = { + "roles/compute.networkUser" = [ + "cloudservices", "compute", "container-engine" + ] + "roles/composer.sharedVpcAgent" = [ + "composer" + ] + "roles/container.hostServiceAgentUser" = [ + "container-engine" # same service identity name as used for networkUser above + ] + } + } +} + +# Cloud Storage + +module "processing-cs-0" { + source = "../../../modules/gcs" + project_id = module.processing-project.project_id + prefix = var.prefix + name = "prc-cs-0" + location = var.location + storage_class = "MULTI_REGIONAL" + encryption_key = var.service_encryption_keys.storage +} + +# internal VPC resources (VPC, firewall and NAT are all skipped when local.use_shared_vpc is true) + +module "processing-vpc" { + source = "../../../modules/net-vpc" + count = local.use_shared_vpc ?
0 : 1 + project_id = module.processing-project.project_id + name = "${var.prefix}-processing" + subnets = [ + { + ip_cidr_range = "10.10.0.0/24" + name = "${var.prefix}-processing" + region = var.region + secondary_ip_ranges = { + pods = "10.10.8.0/22" + services = "10.10.12.0/24" + } + } + ] +} + +module "processing-vpc-firewall" { + source = "../../../modules/net-vpc-firewall" + count = local.use_shared_vpc ? 0 : 1 + project_id = module.processing-project.project_id + network = module.processing-vpc.0.name + default_rules_config = { + admin_ranges = ["10.10.0.0/24"] + } +} + +module "processing-nat" { + count = local.use_shared_vpc ? 0 : 1 + source = "../../../modules/net-cloudnat" + project_id = module.processing-project.project_id + name = "${var.prefix}-processing" + region = var.region + router_network = module.processing-vpc.0.name +} diff --git a/blueprints/data-solutions/data-platform-minimal/03-curated.tf b/blueprints/data-solutions/data-platform-minimal/03-curated.tf new file mode 100644 index 0000000000..5b044b51ef --- /dev/null +++ b/blueprints/data-solutions/data-platform-minimal/03-curated.tf @@ -0,0 +1,99 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# tfdoc:file:description Data curated project and resources.
+ +locals { # IAM bindings (keyed by role) and API list for the curated project + cur_iam = { + "roles/bigquery.dataOwner" = [module.processing-sa-dp-0.iam_email] + "roles/bigquery.dataViewer" = [ + local.groups_iam.data-analysts, + local.groups_iam.data-engineers + ] + "roles/bigquery.jobUser" = [ + module.processing-sa-dp-0.iam_email, + local.groups_iam.data-analysts, + local.groups_iam.data-engineers + ] + "roles/datacatalog.tagTemplateViewer" = [ + local.groups_iam.data-analysts, local.groups_iam.data-engineers + ] + "roles/datacatalog.viewer" = [ + local.groups_iam.data-analysts, local.groups_iam.data-engineers + ] + "roles/storage.objectViewer" = [ + local.groups_iam.data-analysts, local.groups_iam.data-engineers + ] + "roles/storage.objectAdmin" = [module.processing-sa-dp-0.iam_email] + } + cur_services = [ + "iam.googleapis.com", + "bigquery.googleapis.com", + "bigqueryreservation.googleapis.com", + "bigquerystorage.googleapis.com", + "cloudkms.googleapis.com", + "cloudresourcemanager.googleapis.com", + "compute.googleapis.com", + "servicenetworking.googleapis.com", + "serviceusage.googleapis.com", + "stackdriver.googleapis.com", + "storage.googleapis.com", + "storage-component.googleapis.com" + ] +} + +# Project + +module "cur-project" { + source = "../../../modules/project" + parent = var.project_config.parent + billing_account = var.project_config.billing_account_id + project_create = var.project_config.billing_account_id != null # create the project only when a billing account id is set + prefix = var.project_config.billing_account_id == null ? null : var.prefix + name = ( + var.project_config.billing_account_id == null + ? var.project_config.project_ids.curated + : "${var.project_config.project_ids.curated}${local.project_suffix}" + ) + iam = var.project_config.billing_account_id != null ? local.cur_iam : {} # NOTE(review): the landing, processing and common projects pass null here instead of {} - confirm both are equivalent for the project module + iam_additive = var.project_config.billing_account_id == null ?
local.cur_iam : {} + services = local.cur_services + service_encryption_key_ids = { + bq = [var.service_encryption_keys.bq] + storage = [var.service_encryption_keys.storage] + } +} + +# Bigquery + +module "cur-bq-0" { + source = "../../../modules/bigquery-dataset" + project_id = module.cur-project.project_id + id = "${replace(var.prefix, "-", "_")}_cur_bq_0" # dashes in the prefix are replaced with underscores for the dataset id + location = var.location + encryption_key = var.service_encryption_keys.bq +} + +# Cloud storage + +module "cur-cs-0" { + source = "../../../modules/gcs" + project_id = module.cur-project.project_id + prefix = var.prefix + name = "cur-cs-0" + location = var.location + storage_class = "MULTI_REGIONAL" + encryption_key = var.service_encryption_keys.storage + force_destroy = var.data_force_destroy +} diff --git a/blueprints/data-solutions/data-platform-minimal/04-common.tf b/blueprints/data-solutions/data-platform-minimal/04-common.tf new file mode 100644 index 0000000000..3a2d01bdf0 --- /dev/null +++ b/blueprints/data-solutions/data-platform-minimal/04-common.tf @@ -0,0 +1,67 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# tfdoc:file:description Common project and resources.
+ +locals { # IAM bindings for the common project (DLP and Data Catalog roles), keyed by role + iam_common = { + "roles/dlp.admin" = [local.groups_iam.data-security] + "roles/dlp.estimatesAdmin" = [local.groups_iam.data-engineers] + "roles/dlp.reader" = [local.groups_iam.data-engineers] + "roles/dlp.user" = [ + module.processing-sa-dp-0.iam_email, + local.groups_iam.data-engineers + ] + "roles/datacatalog.admin" = [local.groups_iam.data-security] + "roles/datacatalog.viewer" = [ + module.processing-sa-dp-0.iam_email, + local.groups_iam.data-analysts + ] + "roles/datacatalog.categoryFineGrainedReader" = [ + module.processing-sa-dp-0.iam_email + ] + } +} +module "common-project" { + source = "../../../modules/project" + parent = var.project_config.parent + billing_account = var.project_config.billing_account_id + project_create = var.project_config.billing_account_id != null # create the project only when a billing account id is set + prefix = var.project_config.billing_account_id == null ? null : var.prefix + name = ( + var.project_config.billing_account_id == null + ? var.project_config.project_ids.common + : "${var.project_config.project_ids.common}${local.project_suffix}" + ) + iam = var.project_config.billing_account_id != null ? local.iam_common : null # bindings go through iam when the project is created here + iam_additive = var.project_config.billing_account_id == null ?
local.iam_common : null + services = [ + "cloudresourcemanager.googleapis.com", + "datacatalog.googleapis.com", + "dlp.googleapis.com", + "iam.googleapis.com", + "serviceusage.googleapis.com", + "stackdriver.googleapis.com", + ] +} + +# Data Catalog policy tags, driven by var.data_catalog_tags + +module "common-datacatalog" { + source = "../../../modules/data-catalog-policy-tag" + project_id = module.common-project.project_id + name = "${var.prefix}-datacatalog-policy-tags" + location = var.location + tags = var.data_catalog_tags +} diff --git a/blueprints/data-solutions/data-platform-minimal/IAM.md b/blueprints/data-solutions/data-platform-minimal/IAM.md new file mode 100644 index 0000000000..54bde92d50 --- /dev/null +++ b/blueprints/data-solutions/data-platform-minimal/IAM.md @@ -0,0 +1,39 @@ +# IAM bindings reference + +Legend: + additive, conditional. + +## Project cmn + +| members | roles | +|---|---| +|gcp-data-analysts
group|[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer) | +|gcp-data-engineers
group|[roles/dlp.estimatesAdmin](https://cloud.google.com/iam/docs/understanding-roles#dlp.estimatesAdmin)
[roles/dlp.reader](https://cloud.google.com/iam/docs/understanding-roles#dlp.reader)
[roles/dlp.user](https://cloud.google.com/iam/docs/understanding-roles#dlp.user) | +|gcp-data-security
group|[roles/datacatalog.admin](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.admin)
[roles/dlp.admin](https://cloud.google.com/iam/docs/understanding-roles#dlp.admin) | +|prc-dp-0
serviceAccount|[roles/datacatalog.categoryFineGrainedReader](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.categoryFineGrainedReader)
[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer)
[roles/dlp.user](https://cloud.google.com/iam/docs/understanding-roles#dlp.user) | + +## Project cur + +| members | roles | +|---|---| +|gcp-data-analysts
group|[roles/bigquery.dataViewer](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataViewer)
[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser)
[roles/datacatalog.tagTemplateViewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.tagTemplateViewer)
[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer)
[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) | +|gcp-data-engineers
group|[roles/bigquery.dataViewer](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataViewer)
[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser)
[roles/datacatalog.tagTemplateViewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.tagTemplateViewer)
[roles/datacatalog.viewer](https://cloud.google.com/iam/docs/understanding-roles#datacatalog.viewer)
[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) | +|SERVICE_IDENTITY_service-networking
serviceAccount|[roles/servicenetworking.serviceAgent](https://cloud.google.com/iam/docs/understanding-roles#servicenetworking.serviceAgent) +| +|prc-dp-0
serviceAccount|[roles/bigquery.dataOwner](https://cloud.google.com/iam/docs/understanding-roles#bigquery.dataOwner)
[roles/bigquery.jobUser](https://cloud.google.com/iam/docs/understanding-roles#bigquery.jobUser)
[roles/storage.objectAdmin](https://cloud.google.com/iam/docs/understanding-roles#storage.objectAdmin) | + +## Project lnd + +| members | roles | +|---|---| +|lnd-cs-0
serviceAccount|[roles/storage.objectCreator](https://cloud.google.com/iam/docs/understanding-roles#storage.objectCreator) | +|prc-cmp-0
serviceAccount|[roles/storage.objectViewer](https://cloud.google.com/iam/docs/understanding-roles#storage.objectViewer) | +|prc-dp-0
serviceAccount|[roles/storage.objectAdmin](https://cloud.google.com/iam/docs/understanding-roles#storage.objectAdmin) | + +## Project prc + +| members | roles | +|---|---| +|gcp-data-engineers
group|[roles/composer.admin](https://cloud.google.com/iam/docs/understanding-roles#composer.admin)
[roles/composer.environmentAndStorageObjectAdmin](https://cloud.google.com/iam/docs/understanding-roles#composer.environmentAndStorageObjectAdmin)
[roles/iam.serviceAccountUser](https://cloud.google.com/iam/docs/understanding-roles#iam.serviceAccountUser)
[roles/iap.httpsResourceAccessor](https://cloud.google.com/iam/docs/understanding-roles#iap.httpsResourceAccessor)
[roles/serviceusage.serviceUsageConsumer](https://cloud.google.com/iam/docs/understanding-roles#serviceusage.serviceUsageConsumer)
[roles/storage.admin](https://cloud.google.com/iam/docs/understanding-roles#storage.admin) | +|SERVICE_IDENTITY_cloudcomposer-accounts
serviceAccount|[roles/composer.ServiceAgentV2Ext](https://cloud.google.com/iam/docs/understanding-roles#composer.ServiceAgentV2Ext)
[roles/storage.admin](https://cloud.google.com/iam/docs/understanding-roles#storage.admin) | +|SERVICE_IDENTITY_service-networking
serviceAccount|[roles/servicenetworking.serviceAgent](https://cloud.google.com/iam/docs/understanding-roles#servicenetworking.serviceAgent) +| +|prc-cmp-0
serviceAccount|[roles/composer.worker](https://cloud.google.com/iam/docs/understanding-roles#composer.worker)
[roles/dataproc.editor](https://cloud.google.com/iam/docs/understanding-roles#dataproc.editor)
[roles/iam.serviceAccountUser](https://cloud.google.com/iam/docs/understanding-roles#iam.serviceAccountUser)
[roles/storage.admin](https://cloud.google.com/iam/docs/understanding-roles#storage.admin) | +|prc-dp-0
serviceAccount|[roles/dataproc.worker](https://cloud.google.com/iam/docs/understanding-roles#dataproc.worker) | diff --git a/blueprints/data-solutions/data-platform-minimal/README.md b/blueprints/data-solutions/data-platform-minimal/README.md new file mode 100644 index 0000000000..a1d60f1c60 --- /dev/null +++ b/blueprints/data-solutions/data-platform-minimal/README.md @@ -0,0 +1,308 @@ +# Minimal Data Platform + +This module implements a minimal opinionated Data Platform Architecture based on Dataproc Serverless resources. It creates and sets up projects and related resources that compose an end-to-end data environment. + +This minimal Data Platform Architecture keeps the solution to a minimal set of projects. The approach makes the architecture easy to read and operate but limits the ability to scale to handle multiple workloads. To better handle more complex use cases where workloads need processing role segmentation between transformations, or where deeper cost attribution is needed, it is suggested to refer to the [Data Platform](../data-platform-foundations/) blueprint. + +The code is intentionally simple, as it's intended to provide a generic initial setup and then allow easy customizations to complete the implementation of the intended design. + +The following diagram is a high-level reference of the resources created and managed here: + +![Data Platform architecture overview](./images/diagram.png "Data Platform architecture overview") + +A demo [Airflow pipeline](demo/orchestrate_pyspark.py) is also part of this blueprint: it can be built and run on top of the foundational infrastructure to verify or test the setup quickly. + +## Design overview and choices + +Despite its simplicity, this stage implements the basics of a design that we've seen working well for various customers.
+ +The approach adapts to different high-level requirements: + +- boundaries for each step +- clearly defined actors +- least privilege principle +- rely on service account impersonation + +The code in this blueprint doesn't address Organization-level configurations (Organization policy, VPC-SC, centralized logs). We expect those elements to be managed by automation stages external to this script like those in [FAST](../../../fast) and this blueprint deployed on top of them as one of the [stages](../../../fast/stages/3-data-platform/dev/README.md). + +## Project structure + +The Data Platform is designed to rely on several projects, one project per data stage. The stages identified are: + +- landing +- processing +- curated +- common + +This separation into projects allows adhering to the least-privilege principle by using project-level roles. + +The script will create the following projects: + +- **Landing** Data, stored in relevant formats. Structured data can be stored in BigQuery or in GCS using an appropriate file format such as AVRO or Parquet. Unstructured data is stored on Cloud Storage. +- **Processing** Used to host all resources needed to process and orchestrate data movement. Cloud Composer orchestrates all tasks that move data across layers. Cloud Dataproc Serverless processes and moves data between layers. Anonymization or tokenization of Personally Identifiable Information (PII) can be implemented here using Cloud DLP or a custom solution, depending on your requirements. +- **Curated** Cleansed, aggregated and curated data. +- **Common** Common services such as [Cloud DLP](https://cloud.google.com/dlp) or [Data Catalog](https://cloud.google.com/data-catalog/docs/concepts/overview). + +## Roles + +We assign roles on resources at the project level, granting the appropriate roles via groups (humans) and service accounts (services and applications) according to best practices.
+ +## Service accounts + +Service account creation follows the least privilege principle, performing a single task which requires access to a defined set of resources. The table below shows a high level overview of roles for each service account on each data layer, using READ or WRITE access patterns for simplicity. + +A full reference of IAM roles managed by the Data Platform is [available here](IAM.md). + +For detailed roles please refer to the code. + +Using service account keys within a data pipeline exposes you to several security risks deriving from a credentials leak. This blueprint shows how to leverage impersonation to avoid the need to create keys. + +## User groups + +User groups provide a stable frame of reference that allows decoupling the final set of permissions from the stage where entities and resources are created, and their IAM bindings defined. + +We use three groups to control access to resources: + +- *Data Engineers*. They handle and run the Data Hub, with read access to all resources in order to troubleshoot possible issues with pipelines. This team can also impersonate any service account. +- *Data Analysts*. They perform analysis on datasets, with read access to the Data Warehouse Confidential project, and BigQuery READ/WRITE access to the playground project. +- *Data Security*. They handle security configurations related to the Data Hub. This team has admin access to the common project to configure Cloud DLP templates or Data Catalog policy tags. + +### Virtual Private Cloud (VPC) design + +As is often the case in real-world configurations, this blueprint accepts as input an existing [Shared-VPC](https://cloud.google.com/vpc/docs/shared-vpc) via the `network_config` variable. Make sure that the GKE API (`container.googleapis.com`) is enabled in the VPC host project. + +If the `network_config` variable is not provided, one VPC will be created in each project that supports network resources (load, transformation and orchestration).
+ +### IP ranges and subnetting + +To deploy this blueprint with self-managed VPCs you need the following ranges: + +- one /24 for the processing project VPC subnet used for Cloud Dataproc workers +- one /24 range for the orchestration VPC subnet used for Composer workers +- one /22 and one /24 ranges for the secondary ranges associated with the orchestration VPC subnet + +If you are using Shared VPC, you need one subnet with one /22 and one /24 secondary range defined for Composer pods and services. + +In both VPC scenarios, you also need these ranges for Composer: + +- one /24 for Cloud SQL +- one /28 for the GKE control plane + +### Resource naming conventions + +Resources follow the naming convention described below. + +- `prefix-layer` for projects +- `prefix-layer-product` for resources +- `prefix-layer[2]-gcp-product[2]-counter` for services and service accounts + +### Encryption + +We suggest a centralized approach to key management, where Organization Security is the only team that can access encryption material, and keyrings and keys are managed in a project external to the Data Platform. + +![Centralized Cloud Key Management high-level diagram](./images/kms_diagram.png "Centralized Cloud Key Management high-level diagram") + +To configure the use of Cloud KMS on resources, you have to specify the key id on the `service_encryption_keys` variable. Key locations should match resource locations. Example: + +```tfvars +service_encryption_keys = { + bq = "KEY_URL" + composer = "KEY_URL" + compute = "KEY_URL" + storage = "KEY_URL" +} +``` + +This step is optional and depends on customer policies and security best practices. + +## Data Anonymization + +We suggest using Cloud Data Loss Prevention to identify/mask/tokenize your confidential data. 
+ +While implementing a Data Loss Prevention strategy is out of scope for this blueprint, we enable the service in two different projects so that [Cloud Data Loss Prevention templates](https://cloud.google.com/dlp/docs/concepts-templates) can be configured in one of two ways: + +- during the ingestion phase, from Cloud Dataproc +- within the curated layer, in [BigQuery](https://cloud.google.com/bigquery/docs/scan-with-dlp) or [Cloud Dataproc](https://cloud.google.com/dataproc) + +Cloud Data Loss Prevention resources and templates should be stored in the Common project: + +![Centralized Cloud Data Loss Prevention high-level diagram](./images/dlp_diagram.png "Centralized Cloud Data Loss Prevention high-level diagram") + +You can find more details and best practices on using DLP for de-identification and re-identification of PII in large-scale datasets in the [GCP documentation](https://cloud.google.com/architecture/de-identification-re-identification-pii-using-cloud-dlp). + +## Data Catalog + +[Data Catalog](https://cloud.google.com/data-catalog) helps you to document your data entries at scale. Data Catalog relies on [tags](https://cloud.google.com/data-catalog/docs/tags-and-tag-templates#tags) and [tag templates](https://cloud.google.com/data-catalog/docs/tags-and-tag-templates#tag-templates) to manage metadata for all data entries in a unified and centralized service. To implement [column-level security](https://cloud.google.com/bigquery/docs/column-level-security-intro) on BigQuery, we suggest using `Tags` and `Tag templates`. + +The default configuration will implement 3 tags: + +- `3_Confidential`: policy tag for columns that include very sensitive information, such as credit card numbers. +- `2_Private`: policy tag for columns that include sensitive personally identifiable information (PII), such as a person's first name. +- `1_Sensitive`: policy tag for columns that include data that cannot be made public, such as the credit limit.
+ +Anything that is not tagged is available to all users who have access to the data warehouse. + +For the purpose of the blueprint no group has access to tagged data. You can configure your tags and their associated roles by configuring the `data_catalog_tags` variable. We suggest using the "[Best practices for using policy tags in BigQuery](https://cloud.google.com/bigquery/docs/best-practices-policy-tags)" article as a guide to designing your tags structure and access pattern. + +## How to run this script + +To deploy this blueprint on your GCP organization, you will need + +- a folder or organization where new projects will be created +- a billing account that will be associated with the new projects + +The Data Platform is meant to be executed by a Service Account (or a regular user) having this minimal set of permissions: + +- **Billing account** + - `roles/billing.user` +- **Folder level**: + - `roles/resourcemanager.folderAdmin` + - `roles/resourcemanager.projectCreator` +- **KMS Keys** (if CMEK encryption is in use): + - `roles/cloudkms.admin` or a custom role with `cloudkms.cryptoKeys.getIamPolicy`, `cloudkms.cryptoKeys.list`, `cloudkms.cryptoKeys.setIamPolicy` permissions +- **Shared VPC host project** (if configured): + - `roles/compute.xpnAdmin` on the host project folder or org + - `roles/resourcemanager.projectIamAdmin` on the host project, either with no conditions or with a condition allowing [delegated role grants](https://medium.com/google-cloud/managing-gcp-service-usage-through-delegated-role-grants-a843610f2226#:~:text=Delegated%20role%20grants%20is%20a,setIamPolicy%20permission%20on%20a%20resource.)
for `roles/compute.networkUser`, `roles/composer.sharedVpcAgent`, `roles/container.hostServiceAgentUser` + +## Variable configuration + +There are three sets of variables you will need to fill in: + +```tfvars +project_config = { + billing_account_id = "123456-123456-123456" + parent = "folders/12345678" +} +organization_domain = "domain.com" +prefix = "myprefix" +``` + +For finer details, check the variables in [`variables.tf`](./variables.tf) and update them according to the desired configuration. + +*Remember* to create the team groups described [above](#user-groups). + +Once the configuration is complete, deploy the blueprint by running + +```bash +terraform init +terraform apply +``` + +## How to use this blueprint from Terraform + +While this blueprint can be used as a standalone deployment, it can also be called directly as a Terraform module by providing the variable values as shown below: + +```hcl +module "data-platform" { + source = "./fabric/blueprints/data-solutions/data-platform-minimal/" + organization_domain = "example.com" + project_config = { + billing_account_id = "123456-123456-123456" + parent = "folders/12345678" + } + prefix = "myprefix" +} + +# tftest modules=21 resources=110 +``` + +## Customizations + +### Assign roles at BQ Dataset level + +To handle multiple groups of `data-analysts` accessing the same Data Warehouse layer projects but only to the dataset belonging to a specific group, you may want to assign roles at BigQuery dataset level instead of at project-level. +To do this, you need to remove the IAM bindings at project level for the `data-analysts` group and give roles at BigQuery dataset level using the `iam` variable on `bigquery-dataset` modules. + +### Project Configuration + +The solution can be deployed by creating projects on a given parent (organization or folder) or on existing projects. Configure variable `project_config` accordingly.
+ +When you deploy the blueprint on existing projects, the blueprint is designed to rely on different projects configuring IAM binding with an additive approach. + +Once you have identified the required project granularity for your use case, we suggest adapting the terraform script accordingly and relying on authoritative IAM binding. + +### Shared VPC + +To configure the use of a shared VPC, configure the `network_config`, example: + +```tfvars +network_config = { + host_project = "PROJECT_ID" + network_self_link = "https://www.googleapis.com/compute/v1/projects/PROJECT_ID/global/networks/NAME" + subnet_self_links = { + processing_dataproc = "https://www.googleapis.com/compute/v1/projects/PROJECT_ID/regions/REGION/subnetworks/NAME" + processing_composer = "https://www.googleapis.com/compute/v1/projects/PROJECT_ID/regions/REGION/subnetworks/NAME" + } + composer_ip_ranges = { + cloud_sql = "192.168.XXX.XXX/24" + gke_master = "192.168.XXX.XXX/28" + } + composer_secondary_ranges = { + pods = "pods" + services = "services" + } +} +``` + +### Customer Managed Encryption key + +To configure the use of Cloud KMS on resources, configure the `service_encryption_keys` variable. Key locations should match resource locations. Example: + +```tfvars +service_encryption_keys = { + bq = "KEY_URL" + composer = "KEY_URL" + compute = "KEY_URL" + storage = "KEY_URL" +} +``` + +## Demo pipeline + +The application layer is out of scope of this script. For demo purposes only, one Cloud Composer DAG is provided to document how to deploy a Cloud Dataproc Serverless job on the architecture. You can find examples in the [demo](./demo) folder. + +## Files + +| name | description | modules | resources | +|---|---|---|---| +| [01-landing.tf](./01-landing.tf) | Landing project and resources. | gcs · iam-service-account · project | | +| [02-composer.tf](./02-composer.tf) | Cloud Composer resources.
| iam-service-account | google_composer_environment | +| [02-dataproc.tf](./02-dataproc.tf) | Cloud Dataproc resources. | dataproc · gcs · iam-service-account | | +| [02-processing.tf](./02-processing.tf) | Processing project and VPC. | gcs · net-cloudnat · net-vpc · net-vpc-firewall · project | | +| [03-curated.tf](./03-curated.tf) | Data curated project and resources. | bigquery-dataset · gcs · project | | +| [04-common.tf](./04-common.tf) | Common project and resources. | data-catalog-policy-tag · project | | +| [main.tf](./main.tf) | Core locals. | | google_project_iam_member | +| [outputs.tf](./outputs.tf) | Output variables. | | | +| [variables.tf](./variables.tf) | Terraform Variables. | | | + + +## Variables + +| name | description | type | required | default | +|---|---|:---:|:---:|:---:| +| [organization_domain](variables.tf#L114) | Organization domain. | string | ✓ | | +| [prefix](variables.tf#L119) | Prefix used for resource names. | string | ✓ | | +| [project_config](variables.tf#L128) | Provide 'billing_account_id' value if project creation is needed, uses existing 'project_ids' if null. Parent is in 'folders/nnn' or 'organizations/nnn' format. | object({…}) | ✓ | | +| [composer_config](variables.tf#L17) | Cloud Composer config. | object({…}) | | {} | +| [data_catalog_tags](variables.tf#L55) | List of Data Catalog Policy tags to be created with optional IAM binging configuration in {tag => {ROLE => [MEMBERS]}} format. | map(map(list(string))) | | {…} | +| [data_force_destroy](variables.tf#L66) | Flag to set 'force_destroy' on data services like BiguQery or Cloud Storage. | bool | | false | +| [groups](variables.tf#L72) | User groups. | map(string) | | {…} | +| [location](variables.tf#L82) | Location used for multi-regional resources. | string | | "eu" | +| [network_config](variables.tf#L88) | Shared VPC network configurations to use. If null networks will be created in projects. 
| object({…}) | | {} | +| [project_suffix](variables.tf#L152) | Suffix used only for project ids. | string | | null | +| [region](variables.tf#L158) | Region used for regional resources. | string | | "europe-west1" | +| [service_encryption_keys](variables.tf#L164) | Cloud KMS to use to encrypt different services. Key location should match service region. | object({…}) | | {} | + +## Outputs + +| name | description | sensitive | +|---|---|:---:| +| [bigquery-datasets](outputs.tf#L17) | BigQuery datasets. | | +| [dataproc-hystory-server](outputs.tf#L24) | List of bucket names which have been assigned to the cluster. | | +| [gcs-buckets](outputs.tf#L34) | GCS buckets. | ✓ | +| [kms_keys](outputs.tf#L44) | Cloud MKS keys. | | +| [projects](outputs.tf#L49) | GCP Projects informations. | | +| [vpc_network](outputs.tf#L67) | VPC network. | | +| [vpc_subnet](outputs.tf#L75) | VPC subnetworks. | | + + diff --git a/blueprints/data-solutions/data-platform-minimal/demo/orchestrate_pyspark.py b/blueprints/data-solutions/data-platform-minimal/demo/orchestrate_pyspark.py new file mode 100644 index 0000000000..ef5084ffe2 --- /dev/null +++ b/blueprints/data-solutions/data-platform-minimal/demo/orchestrate_pyspark.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python + +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import datetime +import time +import os + +from airflow import models +from airflow.providers.google.cloud.operators.dataproc import ( + DataprocCreateBatchOperator, DataprocDeleteBatchOperator, DataprocGetBatchOperator, DataprocListBatchesOperator + +) +from airflow.utils.dates import days_ago + +# -------------------------------------------------------------------------------- +# Get variables +# -------------------------------------------------------------------------------- +BQ_LOCATION = os.environ.get("BQ_LOCATION") +CURATED_BQ_DATASET = os.environ.get("CURATED_BQ_DATASET") +CURATED_GCS = os.environ.get("CURATED_GCS") +CURATED_PRJ = os.environ.get("CURATED_PRJ") +DP_KMS_KEY = os.environ.get("DP_KMS_KEY", "") +DP_REGION = os.environ.get("DP_REGION") +GCP_REGION = os.environ.get("GCP_REGION") +LAND_PRJ = os.environ.get("LAND_PRJ") +LAND_BQ_DATASET = os.environ.get("LAND_BQ_DATASET") +LAND_GCS = os.environ.get("LAND_GCS") +PHS_CLUSTER_NAME = os.environ.get("PHS_CLUSTER_NAME") +PROCESSING_GCS = os.environ.get("PROCESSING_GCS") +PROCESSING_PRJ = os.environ.get("PROCESSING_PRJ") +PROCESSING_SA_DP = os.environ.get("PROCESSING_SA_DP") +PROCESSING_SA_SUBNET = os.environ.get("PROCESSING_SUBNET") +PROCESSING_SA_VPC = os.environ.get("PROCESSING_VPC") + +PYTHON_FILE_LOCATION = "gs://"+PROCESSING_GCS+"/pyspark_sort.py" +PHS_CLUSTER_PATH = "projects/"+PROCESSING_PRJ+"/regions/"+DP_REGION+"/clusters/"+PHS_CLUSTER_NAME + +default_args = { + # Tell airflow to start one day ago, so that it runs as soon as you upload it + "start_date": days_ago(1), + "region": DP_REGION, +} +with models.DAG( + "dataproc_batch_operators", # The id you will see in the DAG airflow page + default_args=default_args, # The interval with which to schedule the DAG + schedule_interval=None, # Override to match your needs +) as dag: + + create_batch = DataprocCreateBatchOperator( + task_id="batch_create", + project_id=PROCESSING_PRJ, + batch={ + "environment_config": { + "execution_config": { + 
"service_account": PROCESSING_SA_DP, + "subnetwork_uri": PROCESSING_SA_SUBNET + }, + "peripherals_config": { + "spark_history_server_config":{ + "dataproc_cluster": PHS_CLUSTER_PATH + } + } + }, + "pyspark_batch": { + "main_python_file_uri": PYTHON_FILE_LOCATION, + } + }, + batch_id="batch-create-phs-"+str(int(time.time())), + ) + + list_batches = DataprocListBatchesOperator( + task_id="list-all-batches", + ) + + get_batch = DataprocGetBatchOperator( + task_id="get_batch", + batch_id="batch-create-phs", + ) + + create_batch >> list_batches >> get_batch \ No newline at end of file diff --git a/blueprints/data-solutions/data-platform-minimal/demo/pyspark_sort.py b/blueprints/data-solutions/data-platform-minimal/demo/pyspark_sort.py new file mode 100644 index 0000000000..f8e5605301 --- /dev/null +++ b/blueprints/data-solutions/data-platform-minimal/demo/pyspark_sort.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python + +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" Sample pyspark script to be uploaded to Cloud Storage and run on +Cloud Dataproc. +Note this file is not intended to be run directly, but run inside a PySpark +environment. 
+""" + +# [START dataproc_pyspark_sort] +import pyspark + +sc = pyspark.SparkContext() +rdd = sc.parallelize(["Hello,", "world!", "dog", "elephant", "panther"]) +words = sorted(rdd.collect()) +print(words) +# [END dataproc_pyspark_sort] \ No newline at end of file diff --git a/blueprints/data-solutions/data-platform-minimal/images/diagram.png b/blueprints/data-solutions/data-platform-minimal/images/diagram.png new file mode 100644 index 0000000000..7f992cbcb5 Binary files /dev/null and b/blueprints/data-solutions/data-platform-minimal/images/diagram.png differ diff --git a/blueprints/data-solutions/data-platform-minimal/images/dlp_diagram.png b/blueprints/data-solutions/data-platform-minimal/images/dlp_diagram.png new file mode 100644 index 0000000000..76a49ef727 Binary files /dev/null and b/blueprints/data-solutions/data-platform-minimal/images/dlp_diagram.png differ diff --git a/blueprints/data-solutions/data-platform-minimal/images/kms_diagram.png b/blueprints/data-solutions/data-platform-minimal/images/kms_diagram.png new file mode 100644 index 0000000000..b973cabd18 Binary files /dev/null and b/blueprints/data-solutions/data-platform-minimal/images/kms_diagram.png differ diff --git a/blueprints/data-solutions/data-platform-minimal/main.tf b/blueprints/data-solutions/data-platform-minimal/main.tf new file mode 100644 index 0000000000..605e31f8bd --- /dev/null +++ b/blueprints/data-solutions/data-platform-minimal/main.tf @@ -0,0 +1,26 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# tfdoc:file:description Core locals. + +locals { + groups = { + for k, v in var.groups : k => "${v}@${var.organization_domain}" # group name -> full address in the organization domain + } + groups_iam = { + for k, v in local.groups : k => "group:${v}" # same keys, prefixed with "group:" + } + project_suffix = var.project_suffix == null ? "" : "-${var.project_suffix}" # empty string when no suffix is configured + use_shared_vpc = var.network_config.host_project != null # true when a host project is set in network_config +} diff --git a/blueprints/data-solutions/data-platform-minimal/outputs.tf b/blueprints/data-solutions/data-platform-minimal/outputs.tf new file mode 100644 index 0000000000..97eda2a3f5 --- /dev/null +++ b/blueprints/data-solutions/data-platform-minimal/outputs.tf @@ -0,0 +1,81 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# tfdoc:file:description Output variables. + +output "bigquery-datasets" { + description = "BigQuery datasets." + value = { + curated = module.cur-bq-0.dataset_id, + } +} + +output "dataproc-hystory-server" { + description = "List of bucket names which have been assigned to the cluster." + value = { + bucket_names = module.processing-dp-historyserver.bucket_names + http_ports = module.processing-dp-historyserver.http_ports + instance_names = module.processing-dp-historyserver.instance_names + name = module.processing-dp-historyserver.name + } +} + +output "gcs-buckets" { + description = "GCS buckets."
+ sensitive = true + value = { + landing-cs-0 = module.land-sa-cs-0, # NOTE(review): the key names a bucket but the value is the whole "land-sa-cs-0" module - confirm this is the intended module + processing-cs-0 = module.processing-cs-0, + cur-cs-0 = module.cur-cs-0, + } +} + +output "kms_keys" { + description = "Cloud MKS keys." + value = var.service_encryption_keys # passes the input variable through unchanged +} + +output "projects" { + description = "GCP Projects informations." + value = { + project_number = { + landing = module.land-project.number, + common = module.common-project.number, + curated = module.cur-project.number, + processing = module.processing-project.number, + } + project_id = { + landing = module.land-project.project_id, + common = module.common-project.project_id, + curated = module.cur-project.project_id, + processing = module.processing-project.project_id, + } + } +} + +output "vpc_network" { + description = "VPC network." + value = { + processing_dataproc = local.processing_vpc + processing_composer = local.processing_vpc # same local as processing_dataproc + } +} + +output "vpc_subnet" { + description = "VPC subnetworks." + value = { + processing_dataproc = local.processing_subnet + processing_composer = local.processing_subnet # same local as processing_dataproc + } +} diff --git a/blueprints/data-solutions/data-platform-minimal/variables.tf b/blueprints/data-solutions/data-platform-minimal/variables.tf new file mode 100644 index 0000000000..a63f07c38f --- /dev/null +++ b/blueprints/data-solutions/data-platform-minimal/variables.tf @@ -0,0 +1,174 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# tfdoc:file:description Terraform Variables.
+ +variable "composer_config" { + description = "Cloud Composer config." + type = object({ + disable_deployment = optional(bool, false) + environment_size = optional(string, "ENVIRONMENT_SIZE_SMALL") + software_config = optional(object({ + airflow_config_overrides = optional(map(string), {}) + pypi_packages = optional(map(string), {}) + env_variables = optional(map(string), {}) + image_version = optional(string, "composer-2-airflow-2") + }), {}) + workloads_config = optional(object({ + scheduler = optional(object({ + cpu = optional(number, 0.5) + memory_gb = optional(number, 1.875) + storage_gb = optional(number, 1) + count = optional(number, 1) + } + ), {}) + web_server = optional(object({ + cpu = optional(number, 0.5) + memory_gb = optional(number, 1.875) + storage_gb = optional(number, 1) + }), {}) + worker = optional(object({ + cpu = optional(number, 0.5) + memory_gb = optional(number, 1.875) + storage_gb = optional(number, 1) + min_count = optional(number, 1) + max_count = optional(number, 3) + } + ), {}) + }), {}) + }) + nullable = false + default = {} +} + +variable "data_catalog_tags" { + description = "List of Data Catalog Policy tags to be created with optional IAM binging configuration in {tag => {ROLE => [MEMBERS]}} format." + type = map(map(list(string))) + nullable = false + default = { + "3_Confidential" = null + "2_Private" = null + "1_Sensitive" = null + } +} + +variable "data_force_destroy" { + description = "Flag to set 'force_destroy' on data services like BiguQery or Cloud Storage." + type = bool + default = false +} + +variable "groups" { + description = "User groups." + type = map(string) + default = { + data-analysts = "gcp-data-analysts" + data-engineers = "gcp-data-engineers" + data-security = "gcp-data-security" + } +} + +variable "location" { + description = "Location used for multi-regional resources." + type = string + default = "eu" +} + +variable "network_config" { + description = "Shared VPC network configurations to use.
If null networks will be created in projects." + type = object({ + host_project = optional(string) + network_self_link = optional(string) + subnet_self_links = optional(object({ + processing_dataproc = string + processing_composer = string + }), null) + composer_ip_ranges = optional(object({ + connection_subnetwork = optional(string) + cloud_sql = optional(string, "10.20.10.0/24") + gke_master = optional(string, "10.20.11.0/28") + pods_range_name = optional(string, "pods") + services_range_name = optional(string, "services") + }), {}) + # web_server_network_access_control = list(string) + }) + nullable = false + default = {} + validation { + condition = (var.network_config.composer_ip_ranges.cloud_sql == null) != (var.network_config.composer_ip_ranges.connection_subnetwork == null) # NOTE(review): cloud_sql has a non-null optional() default above, so it may never be null here and connection_subnetwork could then never be set - confirm against Terraform optional() default semantics + error_message = "One, and only one, of `network_config.composer_ip_ranges.cloud_sql` or `network_config.composer_ip_ranges.connection_subnetwork` must be specified." + } +} + +variable "organization_domain" { + description = "Organization domain." + type = string +} + +variable "prefix" { + description = "Prefix used for resource names." + type = string + validation { + condition = var.prefix != "" + error_message = "Prefix cannot be empty." + } +} + +variable "project_config" { + description = "Provide 'billing_account_id' value if project creation is needed, uses existing 'project_ids' if null. Parent is in 'folders/nnn' or 'organizations/nnn' format." + type = object({ + billing_account_id = optional(string, null) + parent = string + project_ids = optional(object({ + landing = string + processing = string + curated = string + common = string + }), { + landing = "lnd" + processing = "prc" + curated = "cur" + common = "cmn" + } + ) + }) + validation { + condition = var.project_config.billing_account_id != null || var.project_config.project_ids != null # NOTE(review): project_ids has a non-null default object, so this condition may always be true - confirm it can ever fail + error_message = "At least one of project_config.billing_account_id or var.project_config.project_ids should be set."
+ } +} + +variable "project_suffix" { + description = "Suffix used only for project ids." + type = string + default = null +} + +variable "region" { + description = "Region used for regional resources." + type = string + default = "europe-west1" +} + +variable "service_encryption_keys" { + description = "Cloud KMS to use to encrypt different services. Key location should match service region." + type = object({ + bq = optional(string) + composer = optional(string) + compute = optional(string) + storage = optional(string) + }) + nullable = false + default = {} +}