diff --git a/.changelog/4501.txt b/.changelog/4501.txt new file mode 100644 index 00000000000..4ba7d66e224 --- /dev/null +++ b/.changelog/4501.txt @@ -0,0 +1,3 @@ +```release-note:enhancement +compute: Added graceful termination to `google_container_node_pool` create calls so that partially created node pools will resume the original operation if the Terraform process is killed mid-create. +``` diff --git a/google/resource_container_node_pool.go b/google/resource_container_node_pool.go index d5007f41c9e..711bc298d33 100644 --- a/google/resource_container_node_pool.go +++ b/google/resource_container_node_pool.go @@ -63,6 +63,10 @@ func resourceContainerNodePool() *schema.Resource { ForceNew: true, Description: `The location (region or zone) of the cluster.`, }, + "operation": { + Type: schema.TypeString, + Computed: true, + }, }), } } @@ -315,6 +319,20 @@ func resourceContainerNodePoolCreate(d *schema.ResourceData, meta interface{}) e nodePoolInfo.location, "creating GKE NodePool", userAgent, timeout) if waitErr != nil { + // Check if the create operation failed because Terraform was prematurely terminated. If it was, we can persist the + // operation ID to state so that a subsequent refresh of this resource will wait until the operation has terminated + // before attempting to Read the state of the node pool. This allows a graceful resumption of a Create that was killed + // by the upstream Terraform process exiting early, such as on a SIGTERM. 
+ select { + case <-config.context.Done(): + log.Printf("[DEBUG] Persisting %s so this operation can be resumed \n", operation.Name) + if err := d.Set("operation", operation.Name); err != nil { + return fmt.Errorf("Error setting operation: %s", err) + } + return nil + default: + // leaving default case to ensure this is non-blocking + } // The resource wasn't actually created d.SetId("") return waitErr @@ -356,6 +374,21 @@ func resourceContainerNodePoolRead(d *schema.ResourceData, meta interface{}) err return err } + operation := d.Get("operation").(string) + if operation != "" { + log.Printf("[DEBUG] in progress operation detected at %v, attempting to resume", operation) + op := &containerBeta.Operation{ + Name: operation, + } + if err := d.Set("operation", ""); err != nil { + return fmt.Errorf("Error setting operation: %s", err) + } + waitErr := containerOperationWait(config, op, nodePoolInfo.project, nodePoolInfo.location, "resuming GKE node pool", userAgent, d.Timeout(schema.TimeoutRead)) + if waitErr != nil { + return waitErr + } + } + name := getNodePoolName(d.Id()) clusterNodePoolsGetCall := config.NewContainerBetaClient(userAgent).Projects.Locations.Clusters.NodePools.Get(nodePoolInfo.fullyQualifiedName(name))