-
Notifications
You must be signed in to change notification settings - Fork 137
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add per-queue scheduling rate-limiting #2938
Changes from 16 commits
eca1f7d
ff7ec1d
86cbf41
84f19d5
07fb8be
4d95a8d
68a1aee
8ec0e50
c3a5e38
8eb724e
c00356f
97fbb54
008d906
81d4dce
1bce27d
8d7d05c
e6af79d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -124,10 +124,33 @@ type SchedulingConfig struct { | |
MaximumResourceFractionToSchedule map[string]float64 | ||
// Overrides MaximalClusterFractionToSchedule if set for the current pool. | ||
MaximumResourceFractionToScheduleByPool map[string]map[string]float64 | ||
// Max number of jobs to schedule in each invocation of the scheduler. | ||
MaximumJobsToSchedule uint | ||
// Max number of gangs to schedule in each invocation of the scheduler. | ||
MaximumGangsToSchedule uint | ||
// The rate at which Armada schedules jobs is rate-limited using a token bucket approach. | ||
// Specifically, there is a token bucket that persists between scheduling rounds. | ||
// The bucket fills up at a rate of MaximumSchedulingRate tokens per second and has capacity MaximumSchedulingBurst. | ||
// A token is removed from the bucket when scheduling a job, and scheduling stops while the bucket is empty. | ||
// | ||
// Hence, MaximumSchedulingRate controls the maximum number of jobs scheduled per second in steady-state, | ||
// i.e., once the burst capacity has been exhausted. | ||
// | ||
// Rate-limiting is based on the number of tokens available at the start of each scheduling round, | ||
// i.e., tokens accumulated while scheduling become available at the start of the next scheduling round. | ||
// | ||
// For more information about the rate-limiter, see: | ||
// https://pkg.go.dev/golang.org/x/time/rate#Limiter | ||
MaximumSchedulingRate float64 `validate:"gt=0"` | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It also might be worth exploring whether There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we can leave them as floats. We can use either int or float values in config as-is. |
||
// MaximumSchedulingBurst controls the burst capacity of the rate-limiter. | ||
// | ||
// There are two important implications: | ||
// - Armada will never schedule more than MaximumSchedulingBurst jobs per scheduling round. | ||
// - Gang jobs with cardinality greater than MaximumSchedulingBurst can never be scheduled. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we check for this at submit time? I.e. is is possible for a user to submit a gang job that can never be scheduled? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes. As discussed, let's add a max gang size separately and then have some warning if that is greater than the burst. |
||
MaximumSchedulingBurst int `validate:"gt=0"` | ||
// In addition to the global rate-limiter, there is a separate rate-limiter for each queue. | ||
// These work the same as the global rate-limiter, except they apply only to jobs scheduled from a specific queue. | ||
// | ||
// Per-queue version of MaximumSchedulingRate. | ||
MaximumPerQueueSchedulingRate float64 `validate:"gt=0"` | ||
// Per-queue version of MaximumSchedulingBurst. | ||
MaximumPerQueueSchedulingBurst int `validate:"gt=0"` | ||
// Armada stores contexts associated with recent job scheduling attempts. | ||
// This setting limits the number of such contexts to store. | ||
// Contexts associated with the most recent scheduling attempt for each queue and cluster are always stored. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
package constraints | ||
|
||
import ( | ||
"fmt" | ||
"math" | ||
|
||
"github.com/pkg/errors" | ||
|
@@ -12,32 +13,46 @@ import ( | |
) | ||
|
||
const ( | ||
UnschedulableReasonMaximumResourcesScheduled = "maximum resources scheduled" | ||
UnschedulableReasonMaximumNumberOfJobsScheduled = "maximum number of jobs scheduled" | ||
UnschedulableReasonMaximumNumberOfGangsScheduled = "maximum number of gangs scheduled" | ||
UnschedulableReasonMaximumResourcesPerQueueExceeded = "maximum total resources for this queue exceeded" | ||
// Indicates that the limit on resources scheduled per round has been exceeded. | ||
MaximumResourcesScheduledUnschedulableReason = "maximum resources scheduled" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it might be worth making these reasons into a type just so that functions like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree. But didn't want to do that in this PR. |
||
|
||
// Indicates that a queue has been assigned more than its allowed amount of resources. | ||
MaximumResourcesPerQueueExceededUnschedulableReason = "maximum total resources for this queue exceeded" | ||
|
||
// Indicates that the scheduling rate limit has been exceeded. | ||
GlobalRateLimitExceededUnschedulableReason = "global scheduling rate limit exceeded" | ||
QueueRateLimitExceededUnschedulableReason = "queue scheduling rate limit exceeded" | ||
|
||
// Indicates that scheduling a gang would exceed the rate limit. | ||
GlobalRateLimitExceededByGangUnschedulableReason = "gang would exceed global scheduling rate limit" | ||
QueueRateLimitExceededByGangUnschedulableReason = "gang would exceed queue scheduling rate limit" | ||
|
||
// Indicates that the number of jobs in a gang exceeds the burst size. | ||
// This means the gang can not be scheduled without first increasing the burst size. | ||
GangExceedsGlobalBurstSizeUnschedulableReason = "gang cardinality too large: exceeds global max burst size" | ||
GangExceedsQueueBurstSizeUnschedulableReason = "gang cardinality too large: exceeds queue max burst size" | ||
) | ||
|
||
// IsTerminalUnschedulableReason returns true if reason indicates it's not possible to schedule any more jobs in this round. | ||
// IsTerminalUnschedulableReason returns true if reason indicates | ||
// it's not possible to schedule any more jobs in this round. | ||
func IsTerminalUnschedulableReason(reason string) bool { | ||
if reason == UnschedulableReasonMaximumResourcesScheduled { | ||
if reason == MaximumResourcesScheduledUnschedulableReason { | ||
return true | ||
} | ||
if reason == UnschedulableReasonMaximumNumberOfJobsScheduled { | ||
return true | ||
} | ||
if reason == UnschedulableReasonMaximumNumberOfGangsScheduled { | ||
if reason == GlobalRateLimitExceededUnschedulableReason { | ||
return true | ||
} | ||
return false | ||
} | ||
|
||
// IsTerminalQueueUnschedulableReason returns true if reason indicates | ||
// it's not possible to schedule any more jobs from this queue in this round. | ||
func IsTerminalQueueUnschedulableReason(reason string) bool { | ||
return reason == QueueRateLimitExceededUnschedulableReason | ||
} | ||
|
||
// SchedulingConstraints contains scheduling constraints, e.g., per-queue resource limits. | ||
type SchedulingConstraints struct { | ||
// Max number of jobs to schedule per lease jobs call. | ||
MaximumJobsToSchedule uint | ||
// Max number of gangs to schedule per lease jobs call. | ||
MaximumGangsToSchedule uint | ||
// Max number of jobs to consider for a queue before giving up. | ||
MaxQueueLookback uint | ||
// Jobs leased to this executor must be at least this large. | ||
|
@@ -82,8 +97,6 @@ func SchedulingConstraintsFromSchedulingConfig( | |
maximumResourceFractionToSchedule = m | ||
} | ||
return SchedulingConstraints{ | ||
MaximumJobsToSchedule: config.MaximumJobsToSchedule, | ||
MaximumGangsToSchedule: config.MaximumGangsToSchedule, | ||
MaxQueueLookback: config.MaxQueueLookback, | ||
MinimumJobSize: minimumJobSize, | ||
MaximumResourcesToSchedule: absoluteFromRelativeLimits(totalResources, maximumResourceFractionToSchedule), | ||
|
@@ -99,47 +112,75 @@ func absoluteFromRelativeLimits(totalResources schedulerobjects.ResourceList, re | |
return absoluteLimits | ||
} | ||
|
||
func (constraints *SchedulingConstraints) CheckRoundConstraints(sctx *schedulercontext.SchedulingContext) (bool, string, error) { | ||
// MaximumJobsToSchedule check. | ||
if constraints.MaximumJobsToSchedule != 0 && sctx.NumScheduledJobs == int(constraints.MaximumJobsToSchedule) { | ||
return false, UnschedulableReasonMaximumNumberOfJobsScheduled, nil | ||
} | ||
|
||
// MaximumGangsToSchedule check. | ||
if constraints.MaximumGangsToSchedule != 0 && sctx.NumScheduledGangs == int(constraints.MaximumGangsToSchedule) { | ||
return false, UnschedulableReasonMaximumNumberOfGangsScheduled, nil | ||
} | ||
// ScaleQuantity scales q in-place by a factor f. | ||
// This function overflows for quantities whose milli value can't be expressed as an int64. | ||
// E.g., 1Pi is ok, but not 10Pi. | ||
func ScaleQuantity(q resource.Quantity, f float64) resource.Quantity { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I know this wasn't added here but functions like this scare the hell out of me! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We may actually want to update this to return an error (or panic) if it would overflow. |
||
q.SetMilli(int64(math.Round(float64(q.MilliValue()) * f))) | ||
return q | ||
} | ||
|
||
func (constraints *SchedulingConstraints) CheckRoundConstraints(sctx *schedulercontext.SchedulingContext, queue string) (bool, string, error) { | ||
// MaximumResourcesToSchedule check. | ||
if !sctx.ScheduledResources.IsStrictlyLessOrEqual(constraints.MaximumResourcesToSchedule) { | ||
return false, UnschedulableReasonMaximumResourcesScheduled, nil | ||
return false, MaximumResourcesScheduledUnschedulableReason, nil | ||
} | ||
return true, "", nil | ||
} | ||
|
||
func (constraints *SchedulingConstraints) CheckPerQueueAndPriorityClassConstraints( | ||
func (constraints *SchedulingConstraints) CheckConstraints( | ||
sctx *schedulercontext.SchedulingContext, | ||
queue string, | ||
priorityClassName string, | ||
gctx *schedulercontext.GangSchedulingContext, | ||
) (bool, string, error) { | ||
qctx := sctx.QueueSchedulingContexts[queue] | ||
qctx := sctx.QueueSchedulingContexts[gctx.Queue] | ||
if qctx == nil { | ||
return false, "", errors.Errorf("no QueueSchedulingContext for queue %s", queue) | ||
return false, "", errors.Errorf("no QueueSchedulingContext for queue %s", gctx.Queue) | ||
} | ||
|
||
// Check that the job is large enough for this executor. | ||
if ok, unschedulableReason := RequestsAreLargeEnough(gctx.TotalResourceRequests, constraints.MinimumJobSize); !ok { | ||
return false, unschedulableReason, nil | ||
} | ||
|
||
// Global rate limiter check. | ||
tokens := sctx.Limiter.TokensAt(sctx.Started) | ||
if tokens <= 0 { | ||
return false, GlobalRateLimitExceededUnschedulableReason, nil | ||
} | ||
if sctx.Limiter.Burst() < gctx.Cardinality() { | ||
return false, GangExceedsGlobalBurstSizeUnschedulableReason, nil | ||
} | ||
if tokens < float64(gctx.Cardinality()) { | ||
return false, GlobalRateLimitExceededByGangUnschedulableReason, nil | ||
} | ||
|
||
// Per-queue rate limiter check. | ||
tokens = qctx.Limiter.TokensAt(sctx.Started) | ||
if tokens <= 0 { | ||
return false, QueueRateLimitExceededUnschedulableReason, nil | ||
} | ||
if qctx.Limiter.Burst() < gctx.Cardinality() { | ||
return false, GangExceedsQueueBurstSizeUnschedulableReason, nil | ||
} | ||
if tokens < float64(gctx.Cardinality()) { | ||
return false, QueueRateLimitExceededByGangUnschedulableReason, nil | ||
} | ||
|
||
// PriorityClassSchedulingConstraintsByPriorityClassName check. | ||
if priorityClassConstraint, ok := constraints.PriorityClassSchedulingConstraintsByPriorityClassName[priorityClassName]; ok { | ||
if !qctx.AllocatedByPriorityClass[priorityClassName].IsStrictlyLessOrEqual(priorityClassConstraint.MaximumResourcesPerQueue) { | ||
return false, UnschedulableReasonMaximumResourcesPerQueueExceeded, nil | ||
if priorityClassConstraint, ok := constraints.PriorityClassSchedulingConstraintsByPriorityClassName[gctx.PriorityClassName]; ok { | ||
if !qctx.AllocatedByPriorityClass[gctx.PriorityClassName].IsStrictlyLessOrEqual(priorityClassConstraint.MaximumResourcesPerQueue) { | ||
return false, MaximumResourcesPerQueueExceededUnschedulableReason, nil | ||
} | ||
} | ||
return true, "", nil | ||
} | ||
|
||
// ScaleQuantity scales q in-place by a factor f. | ||
// This function overflows for quantities whose milli value can't be expressed as an int64. | ||
// E.g., 1Pi is ok, but not 10Pi. | ||
func ScaleQuantity(q resource.Quantity, f float64) resource.Quantity { | ||
q.SetMilli(int64(math.Round(float64(q.MilliValue()) * f))) | ||
return q | ||
func RequestsAreLargeEnough(totalResourceRequests, minRequest schedulerobjects.ResourceList) (bool, string) { | ||
for t, minQuantity := range minRequest.Resources { | ||
q := totalResourceRequests.Get(t) | ||
if minQuantity.Cmp(q) == 1 { | ||
return false, fmt.Sprintf("job requests %s %s, but the minimum is %s", q.String(), t, minQuantity.String()) | ||
} | ||
} | ||
return true, "" | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I understand why you've made the rate limits act over a given scheduling cycle, but I think longer term we want to make these apply over some timer horizon. That's because what users and administrators care about isn't the scheduling rate in given scheduling cycle but rather the scheduling rate over various time horizons.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've updated the docstring to make clear the rate-limiter persists between rounds.