diff --git a/.changelog/3928.txt b/.changelog/3928.txt new file mode 100644 index 00000000000..850ad30137f --- /dev/null +++ b/.changelog/3928.txt @@ -0,0 +1,3 @@ +```release-note:enhancement +`compute`: Added graceful termination to `google_compute_instance_group_manager` create calls so that partially created instance group managers will resume the original operation if the Terraform process is killed mid create. +``` diff --git a/google/compute_operation.go b/google/compute_operation.go index 78544c473c1..2355dae95c9 100644 --- a/google/compute_operation.go +++ b/google/compute_operation.go @@ -2,7 +2,10 @@ package google import ( "bytes" + "context" + "errors" "fmt" + "log" "time" "google.golang.org/api/compute/v1" @@ -11,6 +14,7 @@ import ( type ComputeOperationWaiter struct { Service *compute.Service Op *compute.Operation + Context context.Context Project string } @@ -53,6 +57,15 @@ func (w *ComputeOperationWaiter) QueryOp() (interface{}, error) { if w == nil || w.Op == nil { return nil, fmt.Errorf("Cannot query operation, it's unset or nil.") } + if w.Context != nil { + select { + case <-w.Context.Done(): + log.Println("[WARN] request has been cancelled early") + return w.Op, errors.New("unable to finish polling, context has been cancelled") + default: + // default must be here to keep the previous case from blocking + } + } if w.Op.Zone != "" { zone := GetResourceNameFromSelfLink(w.Op.Zone) return w.Service.ZoneOperations.Get(w.Project, zone, w.Op.Name).Do() @@ -88,6 +101,7 @@ func computeOperationWaitTime(config *Config, res interface{}, project, activity w := &ComputeOperationWaiter{ Service: config.clientCompute, + Context: config.context, Op: op, Project: project, } diff --git a/google/resource_compute_instance_group_manager.go b/google/resource_compute_instance_group_manager.go index c6162706737..a28ac6f568c 100644 --- a/google/resource_compute_instance_group_manager.go +++ b/google/resource_compute_instance_group_manager.go @@ -277,6 +277,10 @@ func 
resourceComputeInstanceGroupManager() *schema.Resource { Default: false, Description: `Whether to wait for all instances to be created/updated before returning. Note that if this is set to true and the operation does not succeed, Terraform will continue trying until it times out.`, }, + "operation": { + Type: schema.TypeString, + Computed: true, + }, }, } } @@ -353,6 +357,18 @@ func resourceComputeInstanceGroupManagerCreate(d *schema.ResourceData, meta inte // Wait for the operation to complete err = computeOperationWaitTime(config, op, project, "Creating InstanceGroupManager", d.Timeout(schema.TimeoutCreate)) if err != nil { + // Check if the create operation failed because Terraform was prematurely terminated. If it was, we can persist the + // operation ID to state so that a subsequent refresh of this resource will wait until the operation has terminated + // before attempting to Read the state of the manager. This allows a graceful resumption of a Create that was killed + // by the upstream Terraform process exiting early, e.g. on a SIGTERM. 
+ select { + case <-config.context.Done(): + log.Printf("[DEBUG] Persisting %s so this operation can be resumed \n", op.Name) + d.Set("operation", op.Name) + return nil + default: + // leaving default case to ensure this is non blocking + } return err } @@ -430,6 +446,24 @@ func resourceComputeInstanceGroupManagerRead(d *schema.ResourceData, meta interf return err } + operation := d.Get("operation").(string) + if operation != "" { + log.Printf("[DEBUG] in progress operation detected at %v, attempting to resume", operation) + zone, _ := getZone(d, config) + op := &computeBeta.Operation{ + Name: operation, + Zone: zone, + } + d.Set("operation", "") + err = computeOperationWaitTime(config, op, project, "Creating InstanceGroupManager", d.Timeout(schema.TimeoutCreate)) + if err != nil { + // remove from state to allow refresh to finish + log.Printf("[DEBUG] Resumed operation returned an error, removing from state: %s", err) + d.SetId("") + return nil + } + } + manager, err := getManager(d, meta) if err != nil { return err