GoogleCloudPlatform · tpdownes · Jan 16, 2024 · Jan 16, 2024 · Jan 16, 2024 · Jan 16, 2024
diff --git a/community/modules/compute/htcondor-execute-point/README.md b/community/modules/compute/htcondor-execute-point/README.md
@@ -127,6 +127,37 @@ the University of Wisconsin-Madison. Support for HTCondor is available via:
 
 [chtc]: https://chtc.cs.wisc.edu/
 
+## Behavior of Managed Instance Group (MIG)
+
+Regional [MIGs][mig] are used to provision Execute Points. By default, VMs
+will be provisioned in any of the zones available in that region; however, they
+can be constrained to run in fewer zones (or a single zone) using
+[var.zones](#input_zones).
+
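+By default, the VM replacement policy is set to [opportunistic](https://cloud.google.com/compute/docs/instance-groups/rolling-out-updates-to-managed-instance-groups#type). In practice,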
+this means that the Execute Points will _NOT_ be automatically replaced by
+Terraform when changes to the instance template / HTCondor configuration are
+made. We recommend leaving this at the default value as it will allow the
+HTCondor autoscaler to replace VMs when they become idle without disrupting
+running jobs.
+
+However, if desired, [var.update_policy](#input_update_policy) can be set
+to "PROACTIVE" to enable automatic replacement. This will disrupt running jobs
+and send them back to the queue. Alternatively, one can leave the setting at
+"OPPORTUNISTIC", in which case VMs are replaced only:
+
+- intentionally, by issuing an update via Cloud Console or using gcloud (below)
+- when VMs become unhealthy or are otherwise automatically replaced (e.g. regular
+  Google Cloud maintenance)
+
+For example, to manually update all instances in a MIG:
+
+```text
+gcloud compute instance-groups managed update-instances \
+   <<NAME-OF-MIG>> --all-instances --region <<REGION>> \
+   --project <<PROJECT_ID>> --minimal-action replace
+```
+
 ## Known Issues
 
 When using OS Login with "external users" (outside of the Google Cloud
@@ -217,6 +248,7 @@ limitations under the License.
 | <a name="input_spot"></a> [spot](#input\_spot) | Provision VMs using discounted Spot pricing, allowing for preemption | `bool` | `false` | no |
 | <a name="input_subnetwork_self_link"></a> [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The self link of the subnetwork HTCondor execute points will join | `string` | `null` | no |
 | <a name="input_target_size"></a> [target\_size](#input\_target\_size) | Initial size of the HTCondor execute point pool; set to null (default) to avoid Terraform management of size. | `number` | `null` | no |
+| <a name="input_update_policy"></a> [update\_policy](#input\_update\_policy) | Replacement policy for Execute Point Managed Instance Group ("PROACTIVE" to replace immediately or "OPPORTUNISTIC" to replace upon instance power cycle) | `string` | `"OPPORTUNISTIC"` | no |
 | <a name="input_windows_startup_ps1"></a> [windows\_startup\_ps1](#input\_windows\_startup\_ps1) | Startup script to run at boot-time for Windows-based HTCondor execute points | `list(string)` | `[]` | no |
 | <a name="input_zones"></a> [zones](#input\_zones) | Zone(s) in which execute points may be created. If not supplied, will default to all zones in var.region. | `list(string)` | `[]` | no |
 

diff --git a/community/modules/compute/htcondor-execute-point/main.tf b/community/modules/compute/htcondor-execute-point/main.tf
@@ -199,7 +199,7 @@ module "mig" {
     max_unavailable_percent      = null
     min_ready_sec                = 300
     minimal_action               = "REPLACE"
-    type                         = "OPPORTUNISTIC"
+    type                         = var.update_policy
   }]
 
 }
diff --git a/community/modules/compute/htcondor-execute-point/variables.tf b/community/modules/compute/htcondor-execute-point/variables.tf
@@ -236,3 +236,13 @@ variable "shielded_instance_config" {
     enable_integrity_monitoring = true
   }
 }
+
+variable "update_policy" {
+  description = "Replacement policy for Execute Point Managed Instance Group (\"PROACTIVE\" to replace immediately or \"OPPORTUNISTIC\" to replace upon instance power cycle)"
+  type        = string
+  default     = "OPPORTUNISTIC" # default avoids Terraform-driven replacement of execute points, which would disrupt running jobs
+  validation {
+    condition     = contains(["PROACTIVE", "OPPORTUNISTIC"], var.update_policy) # only these two policy names are accepted
+    error_message = "Allowed string values for var.update_policy are \"PROACTIVE\" or \"OPPORTUNISTIC\"."
+  }
+}
diff --git a/community/modules/scheduler/htcondor-access-point/README.md b/community/modules/scheduler/htcondor-access-point/README.md
@@ -27,6 +27,63 @@ the functionality in these references. Their usage is demonstrated in the
 [htcondor-pool-secrets]: ../htcondor-pool-secrets/README.md
 [IDTOKEN]: https://htcondor.readthedocs.io/en/latest/admin-manual/security.html#introducing-idtokens
 
+## Behavior of Managed Instance Group (MIG)
+
+A regional [MIG][mig] is used to provision the Access Point, although only
+1 node will ever be active at a time. By default, the node will be provisioned
+in any of the zones available in that region; however, it can be constrained to
+run in fewer zones (or a single zone) using [var.zones](#input_zones).
+
+By default, the VM replacement policy is set to [opportunistic]. In practice,
+this means that the Access Point will _NOT_ be automatically replaced by
+Terraform when changes to the instance template / HTCondor configuration are
+made. The Access Point is _NOT_ safe to replace automatically as its local storage
+contains the state of the job queue. By default, the Access Point will be
+replaced only when:
+
+- intentionally by issuing an update via Cloud Console or using gcloud (below)
+- the VM becomes unhealthy or is otherwise automatically replaced (e.g. regular
+  Google Cloud maintenance)
+
+For example, to manually update all instances in a MIG:
+
+```text
+gcloud compute instance-groups managed update-instances \
+   <<NAME-OF-MIG>> --all-instances --region <<REGION>> \
+   --project <<PROJECT_ID>> --minimal-action replace
+```
+
+This mode can be switched to "PROACTIVE" (automatic) replacement by setting
+[var.update_policy](#input_update_policy). In this case, we recommend the use of
+Filestore to store the job queue state ("spool") and setting
+[var.spool_parent_dir](#input_spool_parent_dir) to its mount point:
+
+```yaml
+  - id: spoolfs
+    source: modules/file-system/filestore
+    use:
+    - network1
+    settings:
+      filestore_tier: ENTERPRISE
+      local_mount: /shared
+
+...
+
+  - id: htcondor_access
+    source: community/modules/scheduler/htcondor-access-point
+    use:
+    - network1
+    - spoolfs
+    - htcondor_secrets
+    - htcondor_setup
+    - htcondor_cm
+    - htcondor_execute_point_group
+    settings:
+      spool_parent_dir: /shared
+```
+
+[opportunistic]: https://cloud.google.com/compute/docs/instance-groups/rolling-out-updates-to-managed-instance-groups#type
+
 <!-- BEGINNING OF PRE-COMMIT-TERRAFORM DOCS HOOK -->
 Copyright 2023 Google LLC
 
@@ -106,6 +163,7 @@ limitations under the License.
 | <a name="input_shielded_instance_config"></a> [shielded\_instance\_config](#input\_shielded\_instance\_config) | Shielded VM configuration for the instance (must set var.enabled\_shielded\_vm) | <pre>object({<br>    enable_secure_boot          = bool<br>    enable_vtpm                 = bool<br>    enable_integrity_monitoring = bool<br>  })</pre> | <pre>{<br>  "enable_integrity_monitoring": true,<br>  "enable_secure_boot": true,<br>  "enable_vtpm": true<br>}</pre> | no |
 | <a name="input_spool_parent_dir"></a> [spool\_parent\_dir](#input\_spool\_parent\_dir) | HTCondor access point configuration SPOOL will be set to subdirectory named "spool" | `string` | `"/var/lib/condor"` | no |
 | <a name="input_subnetwork_self_link"></a> [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The self link of the subnetwork in which the HTCondor central manager will be created. | `string` | `null` | no |
+| <a name="input_update_policy"></a> [update\_policy](#input\_update\_policy) | Replacement policy for Access Point Managed Instance Group ("PROACTIVE" to replace immediately or "OPPORTUNISTIC" to replace upon instance power cycle) | `string` | `"OPPORTUNISTIC"` | no |
 | <a name="input_zones"></a> [zones](#input\_zones) | Zone(s) in which access point may be created. If not supplied, will default to all zones in var.region. | `list(string)` | `[]` | no |
 
 ## Outputs

diff --git a/community/modules/scheduler/htcondor-access-point/main.tf b/community/modules/scheduler/htcondor-access-point/main.tf
@@ -210,14 +210,14 @@ module "htcondor_ap" {
 
   update_policy = [{
     instance_redistribution_type = "NONE"
-    replacement_method           = "SUBSTITUTE"
-    max_surge_fixed              = length(local.zones)
+    replacement_method           = "RECREATE" # preserves hostnames (necessary for PROACTIVE replacement)
+    max_surge_fixed              = 0          # must be 0 to preserve hostnames
     max_unavailable_fixed        = length(local.zones)
     max_surge_percent            = null
     max_unavailable_percent      = null
     min_ready_sec                = 300
     minimal_action               = "REPLACE"
-    type                         = "OPPORTUNISTIC"
+    type                         = var.update_policy
   }]
 
   stateful_ips = [{

diff --git a/community/modules/scheduler/htcondor-access-point/variables.tf b/community/modules/scheduler/htcondor-access-point/variables.tf
@@ -216,3 +216,13 @@ variable "shielded_instance_config" {
     enable_integrity_monitoring = true
   }
 }
+
+variable "update_policy" {
+  description = "Replacement policy for Access Point Managed Instance Group (\"PROACTIVE\" to replace immediately or \"OPPORTUNISTIC\" to replace upon instance power cycle)"
+  type        = string
+  default     = "OPPORTUNISTIC" # Access Point local storage holds the job queue state, so it is not replaced automatically by default
+  validation {
+    condition     = contains(["PROACTIVE", "OPPORTUNISTIC"], var.update_policy) # only these two policy names are accepted
+    error_message = "Allowed string values for var.update_policy are \"PROACTIVE\" or \"OPPORTUNISTIC\"."
+  }
+}
diff --git a/community/modules/scheduler/htcondor-central-manager/README.md b/community/modules/scheduler/htcondor-central-manager/README.md
@@ -31,9 +31,14 @@ A regional [MIG][mig] is used to provision the central manager, although only
 in any of the zones available in that region, however, it can be constrained to
 run in fewer zones (or a single zone) using [var.zones](#input_zones).
 
-The VM replacement policy is set to [opportunistic]. In practice, this means
-that an active VM will not be replaced by Terraform actions, but may be
-replaced when either:
+By default, the VM replacement policy is set to [proactive]. In practice, this
+means that the Central Manager will be replaced by Terraform when changes to
+the instance template / HTCondor configuration are made. The Central Manager is
+safe to replace automatically as it gathers its state information from periodic
+messages exchanged with the rest of the HTCondor pool.
+
+This mode can be switched to "OPPORTUNISTIC" by setting [var.update_policy](#input_update_policy).
+In this case, the Central Manager will be replaced only when:
 
 - intentionally by issuing an update via Cloud Console or using gcloud (below)
 - the VM becomes unhealthy or is otherwise automatically replaced (e.g. regular
@@ -47,7 +52,7 @@ gcloud compute instance-groups managed update-instances \
    --project <<PROJECT_ID>> --minimal-action replace
 ```
 
-[opportunistic]: https://cloud.google.com/compute/docs/instance-groups/rolling-out-updates-to-managed-instance-groups#type
+[proactive]: https://cloud.google.com/compute/docs/instance-groups/rolling-out-updates-to-managed-instance-groups#type
 
 ## Limiting inter-zone egress
 
@@ -135,6 +140,7 @@ limitations under the License.
 | <a name="input_service_account_scopes"></a> [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes by which to limit service account attached to central manager. | `set(string)` | <pre>[<br>  "https://www.googleapis.com/auth/cloud-platform"<br>]</pre> | no |
 | <a name="input_shielded_instance_config"></a> [shielded\_instance\_config](#input\_shielded\_instance\_config) | Shielded VM configuration for the instance (must set var.enabled\_shielded\_vm) | <pre>object({<br>    enable_secure_boot          = bool<br>    enable_vtpm                 = bool<br>    enable_integrity_monitoring = bool<br>  })</pre> | <pre>{<br>  "enable_integrity_monitoring": true,<br>  "enable_secure_boot": true,<br>  "enable_vtpm": true<br>}</pre> | no |
 | <a name="input_subnetwork_self_link"></a> [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The self link of the subnetwork in which the HTCondor central manager will be created. | `string` | `null` | no |
+| <a name="input_update_policy"></a> [update\_policy](#input\_update\_policy) | Replacement policy for Central Manager ("PROACTIVE" to replace immediately or "OPPORTUNISTIC" to replace upon instance power cycle). | `string` | `"PROACTIVE"` | no |
 | <a name="input_zones"></a> [zones](#input\_zones) | Zone(s) in which central manager may be created. If not supplied, will default to all zones in var.region. | `list(string)` | `[]` | no |
 
 ## Outputs

diff --git a/community/modules/scheduler/htcondor-central-manager/main.tf b/community/modules/scheduler/htcondor-central-manager/main.tf
@@ -177,14 +177,14 @@ module "htcondor_cm" {
 
   update_policy = [{
     instance_redistribution_type = "NONE"
-    replacement_method           = "SUBSTITUTE"
-    max_surge_fixed              = length(local.zones)
+    replacement_method           = "RECREATE" # preserves hostnames (necessary for PROACTIVE replacement)
+    max_surge_fixed              = 0          # must be 0 to preserve hostnames
     max_unavailable_fixed        = length(local.zones)
     max_surge_percent            = null
     max_unavailable_percent      = null
     min_ready_sec                = 300
     minimal_action               = "REPLACE"
-    type                         = "OPPORTUNISTIC"
+    type                         = var.update_policy
   }]
 
   stateful_ips = [{

diff --git a/community/modules/scheduler/htcondor-central-manager/variables.tf b/community/modules/scheduler/htcondor-central-manager/variables.tf
@@ -169,3 +169,13 @@ variable "shielded_instance_config" {
     enable_integrity_monitoring = true
   }
 }
+
+variable "update_policy" {
+  description = "Replacement policy for Central Manager (\"PROACTIVE\" to replace immediately or \"OPPORTUNISTIC\" to replace upon instance power cycle)."
+  type        = string
+  default     = "PROACTIVE" # Central Manager regathers its state from the pool, so automatic replacement is the default
+  validation {
+    condition     = contains(["PROACTIVE", "OPPORTUNISTIC"], var.update_policy) # only these two policy names are accepted
+    error_message = "Allowed string values for var.update_policy are \"PROACTIVE\" or \"OPPORTUNISTIC\"."
+  }
+}