Skip to content

Commit

Permalink
Allow IGM to resume operation after interruption (#3928) (#7153)
Browse files Browse the repository at this point in the history
Signed-off-by: Modular Magician <[email protected]>
  • Loading branch information
modular-magician authored Aug 29, 2020
1 parent c210913 commit acf78bf
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .changelog/3928.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:enhancement
`compute`: Added graceful termination to `google_compute_instance_group_manager` create calls so that partially created instance group managers will resume the original operation if the Terraform process is killed mid-create.
```
14 changes: 14 additions & 0 deletions google/compute_operation.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@ package google

import (
"bytes"
"context"
"errors"
"fmt"
"log"
"time"

"google.golang.org/api/compute/v1"
Expand All @@ -11,6 +14,7 @@ import (
// ComputeOperationWaiter bundles what is needed to poll a Compute Engine
// operation: the API client, the operation itself, the owning project, and an
// optional context used to detect cancellation between polls.
type ComputeOperationWaiter struct {
// Service is the Compute API client that QueryOp uses to fetch the
// operation's current state.
Service *compute.Service
// Op is the operation being polled; QueryOp returns an error when it is nil.
Op *compute.Operation
// Context is optional. When non-nil, QueryOp checks it before each poll and
// returns an error if it has already been cancelled.
// NOTE(review): the context package docs advise passing a Context as a
// parameter rather than storing it in a struct — confirm this is intended.
Context context.Context
// Project is the project identifier passed to the operations Get call.
Project string
}

Expand Down Expand Up @@ -53,6 +57,15 @@ func (w *ComputeOperationWaiter) QueryOp() (interface{}, error) {
if w == nil || w.Op == nil {
return nil, fmt.Errorf("Cannot query operation, it's unset or nil.")
}
if w.Context != nil {
select {
case <-w.Context.Done():
log.Println("[WARN] request has been cancelled early")
return w.Op, errors.New("unable to finish polling, context has been cancelled")
default:
// default must be here to keep the previous case from blocking
}
}
if w.Op.Zone != "" {
zone := GetResourceNameFromSelfLink(w.Op.Zone)
return w.Service.ZoneOperations.Get(w.Project, zone, w.Op.Name).Do()
Expand Down Expand Up @@ -88,6 +101,7 @@ func computeOperationWaitTime(config *Config, res interface{}, project, activity

w := &ComputeOperationWaiter{
Service: config.clientCompute,
Context: config.context,
Op: op,
Project: project,
}
Expand Down
34 changes: 34 additions & 0 deletions google/resource_compute_instance_group_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,10 @@ func resourceComputeInstanceGroupManager() *schema.Resource {
Default: false,
Description: `Whether to wait for all instances to be created/updated before returning. Note that if this is set to true and the operation does not succeed, Terraform will continue trying until it times out.`,
},
"operation": {
Type: schema.TypeString,
Computed: true,
},
},
}
}
Expand Down Expand Up @@ -353,6 +357,18 @@ func resourceComputeInstanceGroupManagerCreate(d *schema.ResourceData, meta inte
// Wait for the operation to complete
err = computeOperationWaitTime(config, op, project, "Creating InstanceGroupManager", d.Timeout(schema.TimeoutCreate))
if err != nil {
// Check if the create operation failed because Terraform was prematurely terminated. If it was, we can persist the
// operation id to state so that a subsequent refresh of this resource will wait until the operation has terminated
// before attempting to Read the state of the manager. This allows a graceful resumption of a Create that was killed
// by the upstream Terraform process exiting early, such as on a SIGTERM.
select {
case <-config.context.Done():
log.Printf("[DEBUG] Persisting %s so this operation can be resumed \n", op.Name)
d.Set("operation", op.Name)
return nil
default:
// leaving default case to ensure this is non blocking
}
return err
}

Expand Down Expand Up @@ -430,6 +446,24 @@ func resourceComputeInstanceGroupManagerRead(d *schema.ResourceData, meta interf
return err
}

operation := d.Get("operation").(string)
if operation != "" {
log.Printf("[DEBUG] in progress operation detected at %v, attempting to resume", operation)
zone, _ := getZone(d, config)
op := &computeBeta.Operation{
Name: operation,
Zone: zone,
}
d.Set("operation", "")
err = computeOperationWaitTime(config, op, project, "Creating InstanceGroupManager", d.Timeout(schema.TimeoutCreate))
if err != nil {
// remove from state to allow refresh to finish
log.Printf("[DEBUG] Resumed operation returned an error, removing from state: %s", err)
d.SetId("")
return nil
}
}

manager, err := getManager(d, meta)
if err != nil {
return err
Expand Down

0 comments on commit acf78bf

Please sign in to comment.