diff --git a/modules/dataproc/README.md b/modules/dataproc/README.md
index 2c8b2bdf20..79f3c4a8e9 100644
--- a/modules/dataproc/README.md
+++ b/modules/dataproc/README.md
@@ -14,6 +14,7 @@ This module manages a Google Cloud [Dataproc](https://cloud.google.com/dataproc)
- [Additive IAM](#additive-iam)
- [Variables](#variables)
- [Outputs](#outputs)
+- [Fixtures](#fixtures)
## TODO
@@ -25,39 +26,64 @@ This module manages a Google Cloud [Dataproc](https://cloud.google.com/dataproc)
### Simple
```hcl
-module "processing-dp-cluster-2" {
+module "dataproc-cluster" {
source = "./fabric/modules/dataproc"
- project_id = "my-project"
+ project_id = var.project_id
name = "my-cluster"
- region = "europe-west1"
+ region = var.region
}
# tftest modules=1 resources=1
```
### Cluster configuration on GCE
-To set cluster configuration use the 'dataproc_config.cluster_config' variable.
+To set the cluster configuration, use the `dataproc_config.cluster_config` variable. If you don't want to use a dedicated service account, remember to grant `roles/dataproc.worker` to the Compute Engine default service account.
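+
+If you keep the default service account, a minimal sketch of that grant (assumptions: plain google provider resources rather than a Fabric module, and the documented `PROJECT_NUMBER-compute@developer.gserviceaccount.com` address) could look like this:
+
+```hcl
+# look up the project to build the default service account address
+data "google_project" "project" {
+  project_id = var.project_id
+}
+
+# grant the Dataproc worker role to the Compute Engine default service account
+resource "google_project_iam_member" "dataproc_worker" {
+  project = var.project_id
+  role    = "roles/dataproc.worker"
+  member  = "serviceAccount:${data.google_project.project.number}-compute@developer.gserviceaccount.com"
+}
+# tftest skip
+```
+
+The example below uses a dedicated service account instead: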
```hcl
+module "dataproc-service-account" {
+ source = "./fabric/modules/iam-service-account"
+ project_id = var.project_id
+ name = "dataproc-worker"
+ iam_project_roles = {
+ (var.project_id) = ["roles/dataproc.worker"]
+ }
+}
+
+module "firewall" {
+ source = "./fabric/modules/net-vpc-firewall"
+ project_id = var.project_id
+ network = var.vpc.name
+ ingress_rules = {
+ allow-ingress-dataproc = {
+ description = "Allow all traffic between Dataproc nodes."
+ targets = ["dataproc"]
+ sources = ["dataproc"]
+ }
+ }
+}
+
module "processing-dp-cluster" {
source = "./fabric/modules/dataproc"
- project_id = "my-project"
+ project_id = var.project_id
name = "my-cluster"
- region = "europe-west1"
- prefix = "prefix"
+ region = var.region
dataproc_config = {
cluster_config = {
gce_cluster_config = {
- subnetwork = "https://www.googleapis.com/compute/v1/projects/PROJECT/regions/europe-west1/subnetworks/SUBNET"
- zone = "europe-west1-b"
- service_account = ""
- service_account_scopes = ["cloud-platform"]
internal_ip_only = true
+ service_account = module.dataproc-service-account.email
+ service_account_scopes = ["cloud-platform"]
+ subnetwork = var.subnet.self_link
+ tags = ["dataproc"]
+ zone = "${var.region}-b"
}
}
}
+ depends_on = [
+ module.dataproc-service-account, # ensure all grants are done before creating the cluster
+ ]
}
-# tftest modules=1 resources=1
+# tftest modules=3 resources=7
```
### Cluster configuration on GCE with CMEK encryption
@@ -65,55 +91,104 @@ module "processing-dp-cluster" {
To use a Customer Managed Encryption Key, set the `dataproc_config.encryption_config` variable. The Compute Engine service agent and the Cloud Storage service agent need the `CryptoKey Encrypter/Decrypter` role on the configured KMS key ([Documentation](https://cloud.google.com/dataproc/docs/concepts/configuring-clusters/customer-managed-encryption)).
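+
+These grants are not managed by this module; a minimal sketch (assumptions: plain google provider resources, and the documented `service-PROJECT_NUMBER@...` service agent addresses) could look like this:
+
+```hcl
+# look up the project to build the service agent addresses
+data "google_project" "project" {
+  project_id = var.project_id
+}
+
+# allow the Compute Engine and Cloud Storage service agents to use the key
+resource "google_kms_crypto_key_iam_member" "service_agents" {
+  for_each = toset([
+    "serviceAccount:service-${data.google_project.project.number}@compute-system.iam.gserviceaccount.com",
+    "serviceAccount:service-${data.google_project.project.number}@gs-project-accounts.iam.gserviceaccount.com",
+  ])
+  crypto_key_id = var.kms_key.id
+  role          = "roles/cloudkms.cryptoKeyEncrypterDecrypter"
+  member        = each.value
+}
+# tftest skip
+```
+
+With the key permissions in place, the cluster configuration can reference the key: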
```hcl
+module "dataproc-service-account" {
+ source = "./fabric/modules/iam-service-account"
+ project_id = var.project_id
+ name = "dataproc-worker"
+ iam_project_roles = {
+ (var.project_id) = ["roles/dataproc.worker", "roles/cloudkms.cryptoKeyEncrypterDecrypter"]
+ }
+}
+
+module "firewall" {
+ source = "./fabric/modules/net-vpc-firewall"
+ project_id = var.project_id
+ network = var.vpc.name
+ ingress_rules = {
+ allow-ingress-dataproc = {
+ description = "Allow all traffic between Dataproc nodes."
+ targets = ["dataproc"]
+ sources = ["dataproc"]
+ }
+ }
+}
+
module "processing-dp-cluster" {
source = "./fabric/modules/dataproc"
- project_id = "my-project"
+ project_id = var.project_id
name = "my-cluster"
- region = "europe-west1"
- prefix = "prefix"
+ region = var.region
dataproc_config = {
cluster_config = {
gce_cluster_config = {
- subnetwork = "https://www.googleapis.com/compute/v1/projects/PROJECT/regions/europe-west1/subnetworks/SUBNET"
- zone = "europe-west1-b"
- service_account = ""
- service_account_scopes = ["cloud-platform"]
internal_ip_only = true
+ service_account = module.dataproc-service-account.email
+ service_account_scopes = ["cloud-platform"]
+ subnetwork = var.subnet.self_link
+ tags = ["dataproc"]
+ zone = "${var.region}-b"
}
}
encryption_config = {
- kms_key_name = "projects/project-id/locations/region/keyRings/key-ring-name/cryptoKeys/key-name"
+ kms_key_name = var.kms_key.id
}
}
+ depends_on = [
+ module.dataproc-service-account, # ensure all grants are done before creating the cluster
+ ]
}
-# tftest modules=1 resources=1
+# tftest modules=3 resources=8
```
### Cluster configuration on GKE
-To set cluster configuration GKE use the 'dataproc_config.virtual_cluster_config' variable.
+To set the cluster configuration on GKE, use the `dataproc_config.virtual_cluster_config` variable. This example shows the use of a [dedicated service account](https://cloud.google.com/dataproc/docs/guides/dpgke/dataproc-gke-iam#custom_iam_configuration).
```hcl
+locals {
+ dataproc_namespace = "foobar"
+}
+
+module "dataproc-service-account" {
+ source = "./fabric/modules/iam-service-account"
+ project_id = var.project_id
+ name = "dataproc-worker"
+ iam = {
+ "roles/iam.workloadIdentityUser" = [
+ "serviceAccount:${var.project_id}.svc.id.goog[${local.dataproc_namespace}/agent]",
+ "serviceAccount:${var.project_id}.svc.id.goog[${local.dataproc_namespace}/spark-driver]",
+ "serviceAccount:${var.project_id}.svc.id.goog[${local.dataproc_namespace}/spark-executor]"
+ ]
+ }
+ iam_project_roles = {
+ (var.project_id) = ["roles/dataproc.worker"]
+ }
+ depends_on = [
+ module.gke-cluster-standard, # granting workloadIdentityUser requires cluster/pool to be created first
+ ]
+}
+
module "processing-dp-cluster" {
source = "./fabric/modules/dataproc"
- project_id = "my-project"
- name = "my-gke-cluster"
- region = "europe-west1"
- prefix = "prefix"
+ project_id = var.project_id
+ name = "my-dataproc-cluster"
+ region = var.region
dataproc_config = {
virtual_cluster_config = {
kubernetes_cluster_config = {
- kubernetes_namespace = "foobar"
+ kubernetes_namespace = local.dataproc_namespace
kubernetes_software_config = {
component_version = {
- "SPARK" : "3.1-dataproc-7"
+ "SPARK" : "3.1-dataproc-14"
}
properties = {
- "spark:spark.kubernetes.container.image" : "us-east4-docker.pkg.dev/cloud-dataproc/dpgke/sparkengine:dataproc-14"
+ "dataproc:dataproc.gke.agent.google-service-account" = module.dataproc-service-account.email
+ "dataproc:dataproc.gke.spark.driver.google-service-account" = module.dataproc-service-account.email
+ "dataproc:dataproc.gke.spark.executor.google-service-account" = module.dataproc-service-account.email
}
}
gke_cluster_config = {
- gke_cluster_target = "projects/my-project/locations/my-location/clusters/gke-cluster-name"
+ gke_cluster_target = module.gke-cluster-standard.id
node_pool_target = {
node_pool = "node-pool-name"
roles = ["DEFAULT"]
@@ -123,7 +198,7 @@ module "processing-dp-cluster" {
}
}
}
-# tftest modules=1 resources=1
+# tftest modules=4 resources=6 fixtures=fixtures/gke-cluster-standard.tf e2e
```
## IAM
@@ -143,10 +218,9 @@ Refer to the [project module](../project/README.md#iam) for examples of the IAM
```hcl
module "processing-dp-cluster" {
source = "./fabric/modules/dataproc"
- project_id = "my-project"
+ project_id = var.project_id
name = "my-cluster"
- region = "europe-west1"
- prefix = "prefix"
+ region = var.region
iam_by_principals = {
"group:gcp-data-engineers@example.net" = [
"roles/dataproc.viewer"
@@ -166,10 +240,9 @@ module "processing-dp-cluster" {
```hcl
module "processing-dp-cluster" {
source = "./fabric/modules/dataproc"
- project_id = "my-project"
+ project_id = var.project_id
name = "my-cluster"
- region = "europe-west1"
- prefix = "prefix"
+ region = var.region
iam_bindings_additive = {
am1-viewer = {
member = "user:am1@example.com"
@@ -185,24 +258,23 @@ module "processing-dp-cluster" {
| name | description | type | required | default |
|---|---|:---:|:---:|:---:|
| [name](variables.tf#L191) | Cluster name. | string | ✓ |  |
-| [project_id](variables.tf#L206) | Project ID. | string | ✓ |  |
-| [region](variables.tf#L211) | Dataproc region. | string | ✓ |  |
+| [project_id](variables.tf#L196) | Project ID. | string | ✓ |  |
+| [region](variables.tf#L201) | Dataproc region. | string | ✓ |  |
| [dataproc_config](variables.tf#L17) | Dataproc cluster config. | object({…}) |  | {} |
| [iam](variables-iam.tf#L24) | IAM bindings in {ROLE => [MEMBERS]} format. | map(list(string)) |  | {} |
| [iam_bindings](variables-iam.tf#L31) | Authoritative IAM bindings in {KEY => {role = ROLE, members = [], condition = {}}}. Keys are arbitrary. | map(object({…})) |  | {} |
| [iam_bindings_additive](variables-iam.tf#L46) | Individual additive IAM bindings. Keys are arbitrary. | map(object({…})) |  | {} |
| [iam_by_principals](variables-iam.tf#L17) | Authoritative IAM binding in {PRINCIPAL => [ROLES]} format. Principals need to be statically defined to avoid cycle errors. Merged internally with the `iam` variable. | map(list(string)) |  | {} |
| [labels](variables.tf#L185) | The resource labels for instance to use to annotate any related underlying resources, such as Compute Engine VMs. | map(string) |  | {} |
-| [prefix](variables.tf#L196) | Optional prefix used to generate project id and name. | string |  | null |
-| [service_account](variables.tf#L216) | Service account to set on the Dataproc cluster. | string |  | null |
## Outputs
| name | description | sensitive |
|---|---|:---:|
-| [bucket_names](outputs.tf#L19) | List of bucket names which have been assigned to the cluster. | |
-| [http_ports](outputs.tf#L24) | The map of port descriptions to URLs. | |
-| [id](outputs.tf#L29) | Fully qualified cluster id. | |
-| [instance_names](outputs.tf#L34) | List of instance names which have been assigned to the cluster. | |
-| [name](outputs.tf#L43) | The name of the cluster. | |
+| [id](outputs.tf#L30) | Fully qualified cluster id. | |
+| [name](outputs.tf#L45) | The name of the cluster. | |
+
+## Fixtures
+
+- [gke-cluster-standard.tf](../../tests/fixtures/gke-cluster-standard.tf)
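+
+A rough sketch of the fixture's shape (hypothetical: it assumes the Fabric `gke-cluster-standard` module interface with a `vpc_config` block, and workload identity enabled on the cluster) could be:
+
+```hcl
+module "gke-cluster-standard" {
+  source     = "./fabric/modules/gke-cluster-standard"
+  project_id = var.project_id
+  name       = "gke-cluster"
+  location   = var.region
+  vpc_config = {
+    network    = var.vpc.self_link
+    subnetwork = var.subnet.self_link
+  }
+}
+# tftest skip
+```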
diff --git a/modules/dataproc/main.tf b/modules/dataproc/main.tf
index 75fc4ccbf0..774988fb55 100644
--- a/modules/dataproc/main.tf
+++ b/modules/dataproc/main.tf
@@ -16,12 +16,8 @@
# tfdoc:file:description Cloud Dataproc resource definition.
-locals {
- prefix = var.prefix == null ? "" : "${var.prefix}-"
-}
-
resource "google_dataproc_cluster" "cluster" {
- name = "${local.prefix}${var.name}"
+ name = var.name
project = var.project_id
region = var.region
graceful_decommission_timeout = var.dataproc_config.graceful_decommission_timeout
diff --git a/modules/dataproc/outputs.tf b/modules/dataproc/outputs.tf
index 51edb807d4..ae195c8b4d 100644
--- a/modules/dataproc/outputs.tf
+++ b/modules/dataproc/outputs.tf
@@ -16,29 +16,31 @@
# tfdoc:file:description Cloud Dataproc module output.
-output "bucket_names" {
- description = "List of bucket names which have been assigned to the cluster."
- value = google_dataproc_cluster.cluster.cluster_config.0.bucket
-}
-
-output "http_ports" {
- description = "The map of port descriptions to URLs."
- value = google_dataproc_cluster.cluster.cluster_config.0.endpoint_config.0.http_ports
-}
+# FIXME: 2024-03-08: broken in provider
+#output "bucket_names" {
+# description = "List of bucket names which have been assigned to the cluster."
+# value = google_dataproc_cluster.cluster.cluster_config.0.bucket
+#}
+#
+#output "http_ports" {
+# description = "The map of port descriptions to URLs."
+# value = google_dataproc_cluster.cluster.cluster_config.0.endpoint_config.0.http_ports
+#}
output "id" {
description = "Fully qualified cluster id."
value = google_dataproc_cluster.cluster.id
}
-output "instance_names" {
- description = "List of instance names which have been assigned to the cluster."
- value = {
- master = google_dataproc_cluster.cluster.cluster_config.0.master_config.0.instance_names
- worker = google_dataproc_cluster.cluster.cluster_config.0.worker_config.0.instance_names
- preemptible_worker = google_dataproc_cluster.cluster.cluster_config.0.preemptible_worker_config.0.instance_names
- }
-}
+# FIXME: 2024-03-08: broken in provider
+#output "instance_names" {
+# description = "List of instance names which have been assigned to the cluster."
+# value = {
+# master = google_dataproc_cluster.cluster.cluster_config.0.master_config.0.instance_names
+# worker = google_dataproc_cluster.cluster.cluster_config.0.worker_config.0.instance_names
+# preemptible_worker = google_dataproc_cluster.cluster.cluster_config.0.preemptible_worker_config.0.instance_names
+# }
+#}
output "name" {
description = "The name of the cluster."
diff --git a/modules/dataproc/variables.tf b/modules/dataproc/variables.tf
index 8f1586ad98..a8164aaf24 100644
--- a/modules/dataproc/variables.tf
+++ b/modules/dataproc/variables.tf
@@ -193,16 +193,6 @@ variable "name" {
type = string
}
-variable "prefix" {
- description = "Optional prefix used to generate project id and name."
- type = string
- default = null
- validation {
- condition = var.prefix != ""
- error_message = "Prefix cannot be empty, please use null instead."
- }
-}
-
variable "project_id" {
description = "Project ID."
type = string
@@ -212,9 +202,3 @@ variable "region" {
description = "Dataproc region."
type = string
}
-
-variable "service_account" {
- description = "Service account to set on the Dataproc cluster."
- type = string
- default = null
-}