Skip to content

Commit

Permalink
Merge #126925
Browse files Browse the repository at this point in the history
126925: roachprod: implement spot vm preemption detection for AWS. r=DarrylWong,renatolabs a=shailendra-patel

Spot vm preemption detection is not implemented for AWS. Implemented spot vm detection for AWS using ec2 describe-instances and CloudTrail events. Whenver a spot vm is evicted by AWS instance description will have information related to isntance-state, instance-lifecycle and state-reason-code. If describe instance is called after 1 hour of instance termination then there is a fallback on cloud trail events.

Fixes: #126917
Epic: CRDB-10428

Release note: None

Co-authored-by: Shailendra Patel <[email protected]>
  • Loading branch information
craig[bot] and shailendra-patel committed Jul 22, 2024
2 parents 69d5767 + f4c1ce9 commit e8357e7
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 5 deletions.
7 changes: 4 additions & 3 deletions pkg/cmd/roachtest/spec/cluster_spec.go
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ func awsMachineSupportsSSD(machineType string) bool {
}

func getAWSOpts(
machineType string, zones []string, volumeSize, ebsThroughput int, localSSD bool,
machineType string, zones []string, volumeSize, ebsThroughput int, localSSD bool, useSpotVMs bool,
) vm.ProviderOpts {
opts := aws.DefaultProviderOpts()
if volumeSize != 0 {
Expand All @@ -232,6 +232,7 @@ func getAWSOpts(
if len(zones) != 0 {
opts.CreateZones = zones
}
opts.UseSpot = useSpotVMs
return opts
}

Expand Down Expand Up @@ -492,9 +493,9 @@ func (s *ClusterSpec) RoachprodOpts(
switch cloud {
case AWS:
providerOpts = getAWSOpts(machineType, zones, s.VolumeSize, s.AWS.VolumeThroughput,
createVMOpts.SSDOpts.UseLocalSSD)
createVMOpts.SSDOpts.UseLocalSSD, s.UseSpotVMs)
workloadProviderOpts = getAWSOpts(workloadMachineType, zones, s.VolumeSize, s.AWS.VolumeThroughput,
createVMOpts.SSDOpts.UseLocalSSD)
createVMOpts.SSDOpts.UseLocalSSD, s.UseSpotVMs)
case GCE:
providerOpts = getGCEOpts(machineType, zones, s.VolumeSize, ssdCount,
createVMOpts.SSDOpts.UseLocalSSD, s.RAID0, s.TerminateOnMigration,
Expand Down
62 changes: 60 additions & 2 deletions pkg/roachprod/vm/aws/aws.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"math/rand"
"os"
"os/exec"
"regexp"
"slices"
"strconv"
"strings"
Expand Down Expand Up @@ -267,13 +268,66 @@ type Provider struct {
}

func (p *Provider) SupportsSpotVMs() bool {
return false
return true
}

func (p *Provider) GetPreemptedSpotVMs(
l *logger.Logger, vms vm.List, since time.Time,
) ([]vm.PreemptedVM, error) {
return nil, nil
byRegion, err := regionMap(vms)
if err != nil {
return nil, err
}

var preemptedVMs []vm.PreemptedVM
for region, vmList := range byRegion {
args := []string{
"ec2", "describe-instances",
"--region", region,
"--instance-ids",
}
args = append(args, vmList.ProviderIDs()...)
var describeInstancesResponse DescribeInstancesOutput
err = p.runJSONCommand(l, args, &describeInstancesResponse)
if err != nil {
// if the describe-instances operation fails with the error InvalidInstanceID.NotFound,
// we assume that the instance has been preempted and describe-instances operation is attempted one hour after the instance termination
if strings.Contains(err.Error(), "InvalidInstanceID.NotFound") {
l.Errorf("WARNING: received NotFound error when trying to find preemptions: %v", err)
return vm.CreatePreemptedVMs(getInstanceIDsNotFound(err.Error())), nil
}
return nil, err
}

// https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/finding-an-interrupted-Spot-Instance.html
for _, r := range describeInstancesResponse.Reservations {
for _, instance := range r.Instances {
if instance.InstanceLifecycle == "spot" &&
instance.State.Name == "terminated" &&
instance.StateReason.Code == "Server.SpotInstanceTermination" {
preemptedVMs = append(preemptedVMs, vm.PreemptedVM{Name: instance.InstanceID})
}
}
}
}

return preemptedVMs, nil
}

// getInstanceIDsNotFound returns a list of instance IDs that were not found during the describe-instances command.
//
// Sample error message:
//
// ‹An error occurred (InvalidInstanceID.NotFound) when calling the DescribeInstances operation: The instance IDs 'i-02e9adfac0e5fa18f, i-0bc7869fda0299caa' do not exist›
func getInstanceIDsNotFound(errorMsg string) []string {
// Regular expression pattern to find instance IDs between single quotes
re := regexp.MustCompile(`'([^']*)'`)
matches := re.FindStringSubmatch(errorMsg)
if len(matches) > 1 {
instanceIDsStr := matches[1]
return strings.Split(instanceIDsStr, ", ")
}
return nil
}

func (p *Provider) GetHostErrorVMs(
Expand Down Expand Up @@ -982,6 +1036,10 @@ type DescribeInstancesOutput struct {
Code int
Name string
}
StateReason struct {
Code string `json:"Code"`
Message string `json:"Message"`
} `json:"StateReason"`
RootDeviceName string

BlockDeviceMappings []struct {
Expand Down
9 changes: 9 additions & 0 deletions pkg/roachprod/vm/vm.go
Original file line number Diff line number Diff line change
Expand Up @@ -452,6 +452,15 @@ type PreemptedVM struct {
PreemptedAt time.Time
}

// CreatePreemptedVMs returns a list of PreemptedVM created from given list of vmNames
func CreatePreemptedVMs(vmNames []string) []PreemptedVM {
preemptedVMs := make([]PreemptedVM, len(vmNames))
for i, name := range vmNames {
preemptedVMs[i] = PreemptedVM{Name: name}
}
return preemptedVMs
}

// ServiceAddress stores the IP and port of a service.
type ServiceAddress struct {
IP string
Expand Down

0 comments on commit e8357e7

Please sign in to comment.