From 72a0c791f2db492af07c02164a6592fea2fbb608 Mon Sep 17 00:00:00 2001 From: Steven Danna Date: Fri, 19 Apr 2024 09:06:10 +0000 Subject: [PATCH] jobs: only verify fraction progressed range in tests FractionProgressed is used by jobs that do a substantial amount of work. Failing the entire job because it has miscalculated its fraction progressed estimate is not good. Here, we leave the error state for non-release builds but log in release builds. Epic: none Release note (bug fix): A job will now log rather than fail if it reports an out-of bound progress fraction. --- pkg/jobs/BUILD.bazel | 1 + pkg/jobs/jobs.go | 31 +++++++++++++++++++++++-------- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/pkg/jobs/BUILD.bazel b/pkg/jobs/BUILD.bazel index 464296abb802..00cab414184f 100644 --- a/pkg/jobs/BUILD.bazel +++ b/pkg/jobs/BUILD.bazel @@ -29,6 +29,7 @@ go_library( visibility = ["//visibility:public"], deps = [ "//pkg/base", + "//pkg/build", "//pkg/clusterversion", "//pkg/jobs/jobspb", "//pkg/kv", diff --git a/pkg/jobs/jobs.go b/pkg/jobs/jobs.go index bb7e2bd0225c..2d7208dcdd8d 100644 --- a/pkg/jobs/jobs.go +++ b/pkg/jobs/jobs.go @@ -19,6 +19,7 @@ import ( "sync/atomic" "time" + "github.com/cockroachdb/cockroach/pkg/build" "github.com/cockroachdb/cockroach/pkg/clusterversion" "github.com/cockroachdb/cockroach/pkg/jobs/jobspb" "github.com/cockroachdb/cockroach/pkg/kv" @@ -357,16 +358,30 @@ func (u Updater) FractionProgressed(ctx context.Context, progressedFn FractionPr return err } fractionCompleted := progressedFn(ctx, md.Progress.Details) - // allow for slight floating-point rounding inaccuracies - if fractionCompleted > 1.0 && fractionCompleted < 1.01 { - fractionCompleted = 1.0 + + if !build.IsRelease() { + // We allow for slight floating-point rounding + // inaccuracies. We only want to error in non-release + // builds because in large production installations the + // method at least one job uses to calculate process can + // result in substantial floating point inaccuracy. + if fractionCompleted < 0.0 || fractionCompleted > 1.01 { + return errors.Errorf( + "fraction completed %f is outside allowable range [0.0, 1.01]", + fractionCompleted, + ) + } } - if fractionCompleted < 0.0 || fractionCompleted > 1.0 { - return errors.Errorf( - "job %d: fractionCompleted %f is outside allowable range [0.0, 1.0]", - u.j.ID(), fractionCompleted, - ) + + // Clamp to [0.0, 1.0]. + if fractionCompleted > 1.0 { + log.VInfof(ctx, 1, "clamping fraction completed %f to [0.0, 1.0]", fractionCompleted) + fractionCompleted = 1.0 + } else if fractionCompleted < 0.0 { + log.VInfof(ctx, 1, "clamping fraction completed %f to [0.0, 1.0]", fractionCompleted) + fractionCompleted = 0 } + md.Progress.Progress = &jobspb.Progress_FractionCompleted{ FractionCompleted: fractionCompleted, }