diff --git a/modules/dataproc/README.md b/modules/dataproc/README.md
index 2c8b2bdf20..79f3c4a8e9 100644
--- a/modules/dataproc/README.md
+++ b/modules/dataproc/README.md
@@ -14,6 +14,7 @@ This module Manages a Google Cloud [Dataproc](https://cloud.google.com/dataproc)
 - [Additive IAM](#additive-iam)
 - [Variables](#variables)
 - [Outputs](#outputs)
+- [Fixtures](#fixtures)
 
 ## TODO
 
@@ -25,39 +26,64 @@ This module Manages a Google Cloud [Dataproc](https://cloud.google.com/dataproc)
 ### Simple
 
 ```hcl
-module "processing-dp-cluster-2" {
+module "dataproc-cluster" {
   source     = "./fabric/modules/dataproc"
-  project_id = "my-project"
+  project_id = var.project_id
   name       = "my-cluster"
-  region     = "europe-west1"
+  region     = var.region
 }
 # tftest modules=1 resources=1
 ```
 
 ### Cluster configuration on GCE
 
-To set cluster configuration use the 'dataproc_config.cluster_config' variable.
+To set the cluster configuration, use the `dataproc_config.cluster_config` variable. If you don't want to use a dedicated service account, remember to grant `roles/dataproc.worker` to the Compute Engine default service account.
 
 ```hcl
+module "dataproc-service-account" {
+  source     = "./fabric/modules/iam-service-account"
+  project_id = var.project_id
+  name       = "dataproc-worker"
+  iam_project_roles = {
+    (var.project_id) = ["roles/dataproc.worker"]
+  }
+}
+
+module "firewall" {
+  source     = "./fabric/modules/net-vpc-firewall"
+  project_id = var.project_id
+  network    = var.vpc.name
+  ingress_rules = {
+    allow-ingress-dataproc = {
+      description = "Allow all traffic between Dataproc nodes."
+      targets     = ["dataproc"]
+      sources     = ["dataproc"]
+    }
+  }
+}
+
 module "processing-dp-cluster" {
   source     = "./fabric/modules/dataproc"
-  project_id = "my-project"
+  project_id = var.project_id
   name       = "my-cluster"
-  region     = "europe-west1"
-  prefix     = "prefix"
+  region     = var.region
   dataproc_config = {
     cluster_config = {
       gce_cluster_config = {
-        subnetwork             = "https://www.googleapis.com/compute/v1/projects/PROJECT/regions/europe-west1/subnetworks/SUBNET"
-        zone                   = "europe-west1-b"
-        service_account        = ""
-        service_account_scopes = ["cloud-platform"]
         internal_ip_only       = true
+        service_account        = module.dataproc-service-account.email
+        service_account_scopes = ["cloud-platform"]
+        subnetwork             = var.subnet.self_link
+        tags                   = ["dataproc"]
+        zone                   = "${var.region}-b"
       }
     }
   }
+  depends_on = [
+    module.dataproc-service-account, # ensure all grants are done before creating the cluster
+  ]
 }
-# tftest modules=1 resources=1
+# tftest modules=3 resources=7
 ```
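
If you skip the dedicated service account, the grant to the Compute Engine default service account can be sketched as below. This is an illustrative sketch outside the diff, assuming the `google` provider; the resource and data source names are arbitrary:

```hcl
# Illustrative sketch: grant the Dataproc worker role to the project's
# default compute service account (PROJECT_NUMBER-compute@...).
data "google_project" "project" {
  project_id = var.project_id
}

resource "google_project_iam_member" "compute_default_dataproc_worker" {
  project = var.project_id
  role    = "roles/dataproc.worker"
  member  = "serviceAccount:${data.google_project.project.number}-compute@developer.gserviceaccount.com"
}
```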
+ targets = ["dataproc"] + sources = ["dataproc"] + } + } +} + module "processing-dp-cluster" { source = "./fabric/modules/dataproc" - project_id = "my-project" + project_id = var.project_id name = "my-cluster" - region = "europe-west1" - prefix = "prefix" + region = var.region dataproc_config = { cluster_config = { gce_cluster_config = { - subnetwork = "https://www.googleapis.com/compute/v1/projects/PROJECT/regions/europe-west1/subnetworks/SUBNET" - zone = "europe-west1-b" - service_account = "" - service_account_scopes = ["cloud-platform"] internal_ip_only = true + service_account = module.dataproc-service-account.email + service_account_scopes = ["cloud-platform"] + subnetwork = var.subnet.self_link + tags = ["dataproc"] + zone = "${var.region}-b" } } encryption_config = { - kms_key_name = "projects/project-id/locations/region/keyRings/key-ring-name/cryptoKeys/key-name" + kms_key_name = var.kms_key.id } } + depends_on = [ + module.dataproc-service-account, # ensure all grants are done before creating the cluster + ] } -# tftest modules=1 resources=1 +# tftest modules=3 resources=8 ``` ### Cluster configuration on GKE -To set cluster configuration GKE use the 'dataproc_config.virtual_cluster_config' variable. +To set cluster configuration GKE use the 'dataproc_config.virtual_cluster_config' variable. This example shows usage of [dedicated Service Account](https://cloud.google.com/dataproc/docs/guides/dpgke/dataproc-gke-iam#custom_iam_configuration). ```hcl +locals { + dataproc_namespace = "foobar" +} + +module "dataproc-service-account" { + source = "./fabric/modules/iam-service-account" + project_id = var.project_id + name = "dataproc-worker" + iam = { + "roles/iam.workloadIdentityUser" = [ + "serviceAccount:${var.project_id}.svc.id.goog[${local.dataproc_namespace}/agent]", + "serviceAccount:${var.project_id}.svc.id.goog[${local.dataproc_namespace}/spark-driver]", + "serviceAccount:${var.project_id}.svc.id.goog[${local.dataproc_namespace}/spark-executor]" + ] + } + iam_project_roles = { + (var.project_id) = ["roles/dataproc.worker"] + } + depends_on = [ + module.gke-cluster-standard, # granting workloadIdentityUser requires cluster/pool to be created first + ] +} + module "processing-dp-cluster" { source = "./fabric/modules/dataproc" - project_id = "my-project" - name = "my-gke-cluster" - region = "europe-west1" - prefix = "prefix" + project_id = var.project_id + name = "my-dataproc-cluster" + region = var.region dataproc_config = { virtual_cluster_config = { kubernetes_cluster_config = { - kubernetes_namespace = "foobar" + kubernetes_namespace = local.dataproc_namespace kubernetes_software_config = { component_version = { - "SPARK" : "3.1-dataproc-7" + "SPARK" : "3.1-dataproc-14" } properties = { - "spark:spark.kubernetes.container.image" : "us-east4-docker.pkg.dev/cloud-dataproc/dpgke/sparkengine:dataproc-14" + "dataproc:dataproc.gke.agent.google-service-account" = module.dataproc-service-account.email + "dataproc:dataproc.gke.spark.driver.google-service-account" = module.dataproc-service-account.email + "dataproc:dataproc.gke.spark.executor.google-service-account" = module.dataproc-service-account.email } } gke_cluster_config = { - gke_cluster_target = "projects/my-project/locations/my-location/clusters/gke-cluster-name" + gke_cluster_target = module.gke-cluster-standard.id node_pool_target = { node_pool = "node-pool-name" roles = ["DEFAULT"] @@ -123,7 +198,7 @@ module "processing-dp-cluster" { } } } -# tftest modules=1 resources=1 +# tftest modules=4 resources=6 

 ### Cluster configuration on GKE
 
-To set cluster configuration GKE use the 'dataproc_config.virtual_cluster_config' variable.
+To set the cluster configuration on GKE, use the `dataproc_config.virtual_cluster_config` variable. This example shows usage of a [dedicated service account](https://cloud.google.com/dataproc/docs/guides/dpgke/dataproc-gke-iam#custom_iam_configuration).
 
 ```hcl
+locals {
+  dataproc_namespace = "foobar"
+}
+
+module "dataproc-service-account" {
+  source     = "./fabric/modules/iam-service-account"
+  project_id = var.project_id
+  name       = "dataproc-worker"
+  iam = {
+    "roles/iam.workloadIdentityUser" = [
+      "serviceAccount:${var.project_id}.svc.id.goog[${local.dataproc_namespace}/agent]",
+      "serviceAccount:${var.project_id}.svc.id.goog[${local.dataproc_namespace}/spark-driver]",
+      "serviceAccount:${var.project_id}.svc.id.goog[${local.dataproc_namespace}/spark-executor]"
+    ]
+  }
+  iam_project_roles = {
+    (var.project_id) = ["roles/dataproc.worker"]
+  }
+  depends_on = [
+    module.gke-cluster-standard, # granting workloadIdentityUser requires cluster/pool to be created first
+  ]
+}
+
 module "processing-dp-cluster" {
   source     = "./fabric/modules/dataproc"
-  project_id = "my-project"
-  name       = "my-gke-cluster"
-  region     = "europe-west1"
-  prefix     = "prefix"
+  project_id = var.project_id
+  name       = "my-dataproc-cluster"
+  region     = var.region
   dataproc_config = {
     virtual_cluster_config = {
       kubernetes_cluster_config = {
-        kubernetes_namespace = "foobar"
+        kubernetes_namespace = local.dataproc_namespace
         kubernetes_software_config = {
           component_version = {
-            "SPARK" : "3.1-dataproc-7"
+            "SPARK" : "3.1-dataproc-14"
          }
           properties = {
-            "spark:spark.kubernetes.container.image" : "us-east4-docker.pkg.dev/cloud-dataproc/dpgke/sparkengine:dataproc-14"
+            "dataproc:dataproc.gke.agent.google-service-account"          = module.dataproc-service-account.email
+            "dataproc:dataproc.gke.spark.driver.google-service-account"   = module.dataproc-service-account.email
+            "dataproc:dataproc.gke.spark.executor.google-service-account" = module.dataproc-service-account.email
           }
         }
         gke_cluster_config = {
-          gke_cluster_target = "projects/my-project/locations/my-location/clusters/gke-cluster-name"
+          gke_cluster_target = module.gke-cluster-standard.id
           node_pool_target = {
             node_pool = "node-pool-name"
             roles     = ["DEFAULT"]
@@ -123,7 +198,7 @@ module "processing-dp-cluster" {
     }
   }
 }
-# tftest modules=1 resources=1
+# tftest modules=4 resources=6 fixtures=fixtures/gke-cluster-standard.tf e2e
 ```
 
 ## IAM
 
@@ -143,10 +218,9 @@ Refer to the [project module](../project/README.md#iam) for examples of the IAM
 ```hcl
 module "processing-dp-cluster" {
   source     = "./fabric/modules/dataproc"
-  project_id = "my-project"
+  project_id = var.project_id
   name       = "my-cluster"
-  region     = "europe-west1"
-  prefix     = "prefix"
+  region     = var.region
   iam_by_principals = {
     "group:gcp-data-engineers@example.net" = [
       "roles/dataproc.viewer"
@@ -166,10 +240,9 @@ module "processing-dp-cluster" {
 ```hcl
 module "processing-dp-cluster" {
   source     = "./fabric/modules/dataproc"
-  project_id = "my-project"
+  project_id = var.project_id
   name       = "my-cluster"
-  region     = "europe-west1"
-  prefix     = "prefix"
+  region     = var.region
   iam_bindings_additive = {
     am1-viewer = {
       member = "user:am1@example.com"
@@ -185,24 +258,23 @@ module "processing-dp-cluster" {
 ## Variables
 
 | name | description | type | required | default |
 |---|---|:---:|:---:|:---:|
 | [name](variables.tf#L191) | Cluster name. | string | ✓ |  |
-| [project_id](variables.tf#L206) | Project ID. | string | ✓ |  |
-| [region](variables.tf#L211) | Dataproc region. | string | ✓ |  |
+| [project_id](variables.tf#L196) | Project ID. | string | ✓ |  |
+| [region](variables.tf#L201) | Dataproc region. | string | ✓ |  |
 | [dataproc_config](variables.tf#L17) | Dataproc cluster config. | object({…}) |  | {} |
 | [iam](variables-iam.tf#L24) | IAM bindings in {ROLE => [MEMBERS]} format. | map(list(string)) |  | {} |
 | [iam_bindings](variables-iam.tf#L31) | Authoritative IAM bindings in {KEY => {role = ROLE, members = [], condition = {}}}. Keys are arbitrary. | map(object({…})) |  | {} |
 | [iam_bindings_additive](variables-iam.tf#L46) | Individual additive IAM bindings. Keys are arbitrary. | map(object({…})) |  | {} |
 | [iam_by_principals](variables-iam.tf#L17) | Authoritative IAM binding in {PRINCIPAL => [ROLES]} format. Principals need to be statically defined to avoid cycle errors. Merged internally with the `iam` variable. | map(list(string)) |  | {} |
 | [labels](variables.tf#L185) | The resource labels for instance to use to annotate any related underlying resources, such as Compute Engine VMs. | map(string) |  | {} |
-| [prefix](variables.tf#L196) | Optional prefix used to generate project id and name. | string |  | null |
-| [service_account](variables.tf#L216) | Service account to set on the Dataproc cluster. | string |  | null |
 
 ## Outputs
 
 | name | description | sensitive |
 |---|---|:---:|
-| [bucket_names](outputs.tf#L19) | List of bucket names which have been assigned to the cluster. |  |
-| [http_ports](outputs.tf#L24) | The map of port descriptions to URLs. |  |
-| [id](outputs.tf#L29) | Fully qualified cluster id. |  |
-| [instance_names](outputs.tf#L34) | List of instance names which have been assigned to the cluster. |  |
-| [name](outputs.tf#L43) | The name of the cluster. |  |
+| [id](outputs.tf#L30) | Fully qualified cluster id. |  |
+| [name](outputs.tf#L45) | The name of the cluster. |  |
+
+## Fixtures
+
+- [gke-cluster-standard.tf](../../tests/fixtures/gke-cluster-standard.tf)
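
Since the `prefix` variable is removed (see the `variables.tf` and `main.tf` changes below), callers that relied on it can compose the prefixed name themselves. A hypothetical migration sketch, with `var.prefix` defined on the caller's side:

```hcl
# Hypothetical caller-side replacement for the removed prefix variable:
# interpolate the prefix directly into the cluster name.
module "processing-dp-cluster" {
  source     = "./fabric/modules/dataproc"
  project_id = var.project_id
  name       = "${var.prefix}-my-cluster" # var.prefix is defined by the caller
  region     = var.region
}
```
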
"" : "${var.prefix}-" -} - resource "google_dataproc_cluster" "cluster" { - name = "${local.prefix}${var.name}" + name = var.name project = var.project_id region = var.region graceful_decommission_timeout = var.dataproc_config.graceful_decommission_timeout diff --git a/modules/dataproc/outputs.tf b/modules/dataproc/outputs.tf index 51edb807d4..ae195c8b4d 100644 --- a/modules/dataproc/outputs.tf +++ b/modules/dataproc/outputs.tf @@ -16,29 +16,31 @@ # tfdoc:file:description Cloud Dataproc module output. -output "bucket_names" { - description = "List of bucket names which have been assigned to the cluster." - value = google_dataproc_cluster.cluster.cluster_config.0.bucket -} - -output "http_ports" { - description = "The map of port descriptions to URLs." - value = google_dataproc_cluster.cluster.cluster_config.0.endpoint_config.0.http_ports -} +# FIXME: 2024-03-08: broken in provider +#output "bucket_names" { +# description = "List of bucket names which have been assigned to the cluster." +# value = google_dataproc_cluster.cluster.cluster_config.0.bucket +#} +# +#output "http_ports" { +# description = "The map of port descriptions to URLs." +# value = google_dataproc_cluster.cluster.cluster_config.0.endpoint_config.0.http_ports +#} output "id" { description = "Fully qualified cluster id." value = google_dataproc_cluster.cluster.id } -output "instance_names" { - description = "List of instance names which have been assigned to the cluster." - value = { - master = google_dataproc_cluster.cluster.cluster_config.0.master_config.0.instance_names - worker = google_dataproc_cluster.cluster.cluster_config.0.worker_config.0.instance_names - preemptible_worker = google_dataproc_cluster.cluster.cluster_config.0.preemptible_worker_config.0.instance_names - } -} +# FIXME: 2024-03-08: broken in provider +#output "instance_names" { +# description = "List of instance names which have been assigned to the cluster." +# value = { +# master = google_dataproc_cluster.cluster.cluster_config.0.master_config.0.instance_names +# worker = google_dataproc_cluster.cluster.cluster_config.0.worker_config.0.instance_names +# preemptible_worker = google_dataproc_cluster.cluster.cluster_config.0.preemptible_worker_config.0.instance_names +# } +#} output "name" { description = "The name of the cluster." diff --git a/modules/dataproc/variables.tf b/modules/dataproc/variables.tf index 8f1586ad98..a8164aaf24 100644 --- a/modules/dataproc/variables.tf +++ b/modules/dataproc/variables.tf @@ -193,16 +193,6 @@ variable "name" { type = string } -variable "prefix" { - description = "Optional prefix used to generate project id and name." - type = string - default = null - validation { - condition = var.prefix != "" - error_message = "Prefix cannot be empty, please use null instead." - } -} - variable "project_id" { description = "Project ID." type = string @@ -212,9 +202,3 @@ variable "region" { description = "Dataproc region." type = string } - -variable "service_account" { - description = "Service account to set on the Dataproc cluster." - type = string - default = null -}