From 38738d91520c7108df782e78256a480ea9943e8e Mon Sep 17 00:00:00 2001 From: David Moravek Date: Thu, 26 Oct 2023 12:52:04 +0200 Subject: [PATCH] [FLINK-35553][runtime] Wire-up RescaleManager with CheckpointLifecycleListener in Executing state --- .../generated/all_jobmanager_section.html | 8 +- .../generated/expert_scheduling_section.html | 8 +- .../generated/job_manager_configuration.html | 8 +- .../configuration/JobManagerOptions.java | 21 ++- .../checkpoint/CheckpointStatsListener.java | 33 ++++ .../DefaultCheckpointStatsTracker.java | 29 ++++ .../scheduler/adaptive/AdaptiveScheduler.java | 148 +++++++++++++++-- .../adaptive/DefaultRescaleManager.java | 43 ++--- .../runtime/scheduler/adaptive/Executing.java | 57 ++++++- .../runtime/scheduler/adaptive/State.java | 34 +++- .../DefaultCheckpointStatsTrackerTest.java | 67 ++++++++ .../adaptive/AdaptiveSchedulerBuilder.java | 40 ++++- .../adaptive/AdaptiveSchedulerTest.java | 156 +++++++++++++++++- .../adaptive/DefaultRescaleManagerTest.java | 3 +- .../scheduler/adaptive/ExecutingTest.java | 128 +++++++++++++- .../scheduling/RescaleOnCheckpointITCase.java | 146 ++++++++++++++++ 16 files changed, 864 insertions(+), 65 deletions(-) create mode 100644 flink-runtime/src/main/java/org/apache/flink/runtime/checkpoint/CheckpointStatsListener.java create mode 100644 flink-tests/src/test/java/org/apache/flink/test/scheduling/RescaleOnCheckpointITCase.java diff --git a/docs/layouts/shortcodes/generated/all_jobmanager_section.html b/docs/layouts/shortcodes/generated/all_jobmanager_section.html index 4539ce365d5816..760ca029e04755 100644 --- a/docs/layouts/shortcodes/generated/all_jobmanager_section.html +++ b/docs/layouts/shortcodes/generated/all_jobmanager_section.html @@ -12,7 +12,7 @@
jobmanager.adaptive-scheduler.max-delay-for-scale-trigger
(none) Duration - The maximum time the JobManager will wait with evaluating previously observed events for rescaling (default: 0ms if checkpointing is disabled and %dx of the checkpointing interval if checkpointing is enabled). + The maximum time the JobManager will wait with evaluating previously observed events for rescaling (default: 0ms if checkpointing is disabled and the checkpointing interval multiplied by the by-1-incremented parameter value of jobmanager.adaptive-scheduler.scale-on-failed-checkpoints-count if checkpointing is enabled).
jobmanager.adaptive-scheduler.min-parallelism-increase
@@ -32,6 +32,12 @@ Duration The maximum time the JobManager will wait to acquire all required resources after a job submission or restart. Once elapsed it will try to run the job with a lower parallelism, or fail if the minimum amount of resources could not be acquired.
Increasing this value will make the cluster more resilient against temporary resources shortages (e.g., there is more time for a failed TaskManager to be restarted).
Setting a negative duration will disable the resource timeout: The JobManager will wait indefinitely for resources to appear.
If scheduler-mode is configured to REACTIVE, this configuration value will default to a negative value to disable the resource timeout. + +
jobmanager.adaptive-scheduler.scale-on-failed-checkpoints-count
+ 2 + Integer + The number of consecutive failed checkpoints that will trigger rescaling even in the absence of a completed checkpoint. +
jobmanager.adaptive-scheduler.scaling-interval.max
(none) diff --git a/docs/layouts/shortcodes/generated/expert_scheduling_section.html b/docs/layouts/shortcodes/generated/expert_scheduling_section.html index 10e5ad134cecef..6be6547547cad8 100644 --- a/docs/layouts/shortcodes/generated/expert_scheduling_section.html +++ b/docs/layouts/shortcodes/generated/expert_scheduling_section.html @@ -90,7 +90,7 @@
jobmanager.adaptive-scheduler.max-delay-for-scale-trigger
(none) Duration - The maximum time the JobManager will wait with evaluating previously observed events for rescaling (default: 0ms if checkpointing is disabled and %dx of the checkpointing interval if checkpointing is enabled). + The maximum time the JobManager will wait with evaluating previously observed events for rescaling (default: 0ms if checkpointing is disabled and the checkpointing interval multiplied by the by-1-incremented parameter value of jobmanager.adaptive-scheduler.scale-on-failed-checkpoints-count if checkpointing is enabled).
jobmanager.adaptive-scheduler.min-parallelism-increase
@@ -110,6 +110,12 @@ Duration The maximum time the JobManager will wait to acquire all required resources after a job submission or restart. Once elapsed it will try to run the job with a lower parallelism, or fail if the minimum amount of resources could not be acquired.
Increasing this value will make the cluster more resilient against temporary resources shortages (e.g., there is more time for a failed TaskManager to be restarted).
Setting a negative duration will disable the resource timeout: The JobManager will wait indefinitely for resources to appear.
If scheduler-mode is configured to REACTIVE, this configuration value will default to a negative value to disable the resource timeout. + +
jobmanager.adaptive-scheduler.scale-on-failed-checkpoints-count
+ 2 + Integer + The number of consecutive failed checkpoints that will trigger rescaling even in the absence of a completed checkpoint. +
jobmanager.adaptive-scheduler.scaling-interval.max
(none) diff --git a/docs/layouts/shortcodes/generated/job_manager_configuration.html b/docs/layouts/shortcodes/generated/job_manager_configuration.html index df84946a5b0709..3b2ecf56ffe8ef 100644 --- a/docs/layouts/shortcodes/generated/job_manager_configuration.html +++ b/docs/layouts/shortcodes/generated/job_manager_configuration.html @@ -12,7 +12,7 @@
jobmanager.adaptive-scheduler.max-delay-for-scale-trigger
(none) Duration - The maximum time the JobManager will wait with evaluating previously observed events for rescaling (default: 0ms if checkpointing is disabled and %dx of the checkpointing interval if checkpointing is enabled). + The maximum time the JobManager will wait with evaluating previously observed events for rescaling (default: 0ms if checkpointing is disabled and the checkpointing interval multiplied by the by-1-incremented parameter value of jobmanager.adaptive-scheduler.scale-on-failed-checkpoints-count if checkpointing is enabled).
jobmanager.adaptive-scheduler.min-parallelism-increase
@@ -32,6 +32,12 @@ Duration The maximum time the JobManager will wait to acquire all required resources after a job submission or restart. Once elapsed it will try to run the job with a lower parallelism, or fail if the minimum amount of resources could not be acquired.
Increasing this value will make the cluster more resilient against temporary resources shortages (e.g., there is more time for a failed TaskManager to be restarted).
Setting a negative duration will disable the resource timeout: The JobManager will wait indefinitely for resources to appear.
If scheduler-mode is configured to REACTIVE, this configuration value will default to a negative value to disable the resource timeout. + +
jobmanager.adaptive-scheduler.scale-on-failed-checkpoints-count
+ 2 + Integer + The number of consecutive failed checkpoints that will trigger rescaling even in the absence of a completed checkpoint. +
jobmanager.adaptive-scheduler.scaling-interval.max
(none) diff --git a/flink-core/src/main/java/org/apache/flink/configuration/JobManagerOptions.java b/flink-core/src/main/java/org/apache/flink/configuration/JobManagerOptions.java index ab95a0e2669422..e1fdb25c48b128 100644 --- a/flink-core/src/main/java/org/apache/flink/configuration/JobManagerOptions.java +++ b/flink-core/src/main/java/org/apache/flink/configuration/JobManagerOptions.java @@ -39,7 +39,6 @@ public class JobManagerOptions { public static final MemorySize MIN_JVM_HEAP_SIZE = MemorySize.ofMebiBytes(128); - public static final int FACTOR_FOR_DEFAULT_MAXIMUM_DELAY_FOR_RESCALE_TRIGGER = 3; /** * The config parameter defining the network address to connect to for communication with the @@ -574,6 +573,20 @@ public InlineElement getDescription() { code(SchedulerExecutionMode.REACTIVE.name())) .build()); + @Documentation.Section({ + Documentation.Sections.EXPERT_SCHEDULING, + Documentation.Sections.ALL_JOB_MANAGER + }) + public static final ConfigOption SCHEDULER_SCALE_ON_FAILED_CHECKPOINTS_COUNT = + key("jobmanager.adaptive-scheduler.scale-on-failed-checkpoints-count") + .intType() + .defaultValue(2) + .withDescription( + Description.builder() + .text( + "The number of consecutive failed checkpoints that will trigger rescaling even in the absence of a completed checkpoint.") + .build()); + @Documentation.Section({ Documentation.Sections.EXPERT_SCHEDULING, Documentation.Sections.ALL_JOB_MANAGER @@ -586,10 +599,8 @@ public InlineElement getDescription() { Description.builder() .text( "The maximum time the JobManager will wait with evaluating previously observed events for rescaling (default: 0ms if checkpointing is disabled " - + "and %dx of the checkpointing interval if checkpointing is enabled).", - text( - String.valueOf( - FACTOR_FOR_DEFAULT_MAXIMUM_DELAY_FOR_RESCALE_TRIGGER))) + + "and the checkpointing interval multiplied by the by-1-incremented parameter value of %s if checkpointing is enabled).", + 
text(SCHEDULER_SCALE_ON_FAILED_CHECKPOINTS_COUNT.key())) .build()); @Documentation.Section({ diff --git a/flink-runtime/src/main/java/org/apache/flink/runtime/checkpoint/CheckpointStatsListener.java b/flink-runtime/src/main/java/org/apache/flink/runtime/checkpoint/CheckpointStatsListener.java new file mode 100644 index 00000000000000..752e0881e716ac --- /dev/null +++ b/flink-runtime/src/main/java/org/apache/flink/runtime/checkpoint/CheckpointStatsListener.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.runtime.checkpoint; + +/** An interface that allows listening on the checkpoint lifecycle. */ +public interface CheckpointStatsListener { + + /** Called when a checkpoint was completed successfully. */ + default void onCompletedCheckpoint() { + // No-op. + } + + /** Called when a checkpoint failed. */ + default void onFailedCheckpoint() { + // No-op. 
+ } +} diff --git a/flink-runtime/src/main/java/org/apache/flink/runtime/checkpoint/DefaultCheckpointStatsTracker.java b/flink-runtime/src/main/java/org/apache/flink/runtime/checkpoint/DefaultCheckpointStatsTracker.java index 9e541b1b48b515..79a4e609503c83 100644 --- a/flink-runtime/src/main/java/org/apache/flink/runtime/checkpoint/DefaultCheckpointStatsTracker.java +++ b/flink-runtime/src/main/java/org/apache/flink/runtime/checkpoint/DefaultCheckpointStatsTracker.java @@ -73,6 +73,7 @@ public class DefaultCheckpointStatsTracker implements CheckpointStatsTracker { private Optional jobInitializationMetricsBuilder = Optional.empty(); + @Nullable private final CheckpointStatsListener checkpointStatsListener; /** Latest created snapshot. */ private volatile CheckpointStatsSnapshot latestSnapshot; @@ -95,9 +96,25 @@ public class DefaultCheckpointStatsTracker implements CheckpointStatsTracker { */ public DefaultCheckpointStatsTracker( int numRememberedCheckpoints, JobManagerJobMetricGroup metricGroup) { + this(numRememberedCheckpoints, metricGroup, null); + } + + /** + * Creates a new checkpoint stats tracker. + * + * @param numRememberedCheckpoints Maximum number of checkpoints to remember, including in + * progress ones. + * @param metricGroup Metric group for exposed metrics. + * @param checkpointStatsListener Listener for monitoring checkpoint-related events. 
+ */ + public DefaultCheckpointStatsTracker( + int numRememberedCheckpoints, + JobManagerJobMetricGroup metricGroup, + @Nullable CheckpointStatsListener checkpointStatsListener) { checkArgument(numRememberedCheckpoints >= 0, "Negative number of remembered checkpoints"); this.history = new CheckpointStatsHistory(numRememberedCheckpoints); this.metricGroup = metricGroup; + this.checkpointStatsListener = checkpointStatsListener; // Latest snapshot is empty latestSnapshot = @@ -203,6 +220,10 @@ public void reportCompletedCheckpoint(CompletedCheckpointStats completed) { dirty = true; logCheckpointStatistics(completed); + + if (checkpointStatsListener != null) { + checkpointStatsListener.onCompletedCheckpoint(); + } } finally { statsReadWriteLock.unlock(); } @@ -217,6 +238,10 @@ public void reportFailedCheckpoint(FailedCheckpointStats failed) { dirty = true; logCheckpointStatistics(failed); + + if (checkpointStatsListener != null) { + checkpointStatsListener.onFailedCheckpoint(); + } } finally { statsReadWriteLock.unlock(); } @@ -256,6 +281,10 @@ public void reportFailedCheckpointsWithoutInProgress() { counts.incrementFailedCheckpointsWithoutInProgress(); dirty = true; + + if (checkpointStatsListener != null) { + checkpointStatsListener.onFailedCheckpoint(); + } } finally { statsReadWriteLock.unlock(); } diff --git a/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/adaptive/AdaptiveScheduler.java b/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/adaptive/AdaptiveScheduler.java index ea27ab6888644e..fb6344b31dfcf2 100644 --- a/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/adaptive/AdaptiveScheduler.java +++ b/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/adaptive/AdaptiveScheduler.java @@ -42,6 +42,7 @@ import org.apache.flink.runtime.checkpoint.CheckpointMetrics; import org.apache.flink.runtime.checkpoint.CheckpointRecoveryFactory; import org.apache.flink.runtime.checkpoint.CheckpointScheduling; +import 
org.apache.flink.runtime.checkpoint.CheckpointStatsListener; import org.apache.flink.runtime.checkpoint.CheckpointStatsSnapshot; import org.apache.flink.runtime.checkpoint.CheckpointStatsTracker; import org.apache.flink.runtime.checkpoint.CheckpointsCleaner; @@ -118,6 +119,7 @@ import org.apache.flink.runtime.state.KeyGroupRange; import org.apache.flink.runtime.util.BoundedFIFOQueue; import org.apache.flink.runtime.util.ResourceCounter; +import org.apache.flink.util.ConfigurationException; import org.apache.flink.util.ExceptionUtils; import org.apache.flink.util.FlinkException; import org.apache.flink.util.Preconditions; @@ -145,6 +147,7 @@ import java.util.concurrent.Executor; import java.util.concurrent.ScheduledFuture; import java.util.concurrent.TimeUnit; +import java.util.function.BiFunction; import java.util.function.Function; import static org.apache.flink.configuration.JobManagerOptions.MAXIMUM_DELAY_FOR_SCALE_TRIGGER; @@ -188,13 +191,14 @@ public class AdaptiveScheduler */ public static class Settings { - public static Settings of(Configuration configuration) { + public static Settings of(Configuration configuration) throws ConfigurationException { return of(configuration, null); } public static Settings of( Configuration configuration, - @Nullable JobCheckpointingSettings checkpointingConfiguration) { + @Nullable JobCheckpointingSettings checkpointingConfiguration) + throws ConfigurationException { final SchedulerExecutionMode executionMode = configuration.get(JobManagerOptions.SCHEDULER_MODE); Duration allocationTimeoutDefault = @@ -224,11 +228,36 @@ public static Settings of( scalingIntervalMin); } + final int rescaleOnFailedCheckpointsCount = + configuration.get( + JobManagerOptions.SCHEDULER_SCALE_ON_FAILED_CHECKPOINTS_COUNT); + if (rescaleOnFailedCheckpointsCount < 1) { + throw new ConfigurationException( + String.format( + "%s should have a value of 1 or higher.", + JobManagerOptions.SCHEDULER_SCALE_ON_FAILED_CHECKPOINTS_COUNT + .key())); + } + + // 
default value generation is documented in JobManagerOptions final Duration maximumDelayForRescaleTriggerDefault = checkpointingConfiguration != null + && checkpointingConfiguration + .getCheckpointCoordinatorConfiguration() + .isCheckpointingEnabled() + // incrementing the rescaleOnFailedCheckpointsCount by 1 is done to + // avoid introducing a race-condition between the two parameters + // (SCHEDULER_SCALE_ON_FAILED_CHECKPOINTS_COUNT and + // MAXIMUM_DELAY_FOR_SCALE_TRIGGER). Without the increment, we would + // have two configuration parameters that result in roughly the same + // timeout (with the MAXIMUM_DELAY_FOR_SCALE_TRIGGER being probably a + // bit faster). The user might experience unexpected behavior if the + // SCHEDULER_SCALE_ON_FAILED_CHECKPOINTS_COUNT is configured and + // MAXIMUM_DELAY_FOR_SCALE_TRIGGER is kept untouched in that case. + // Incrementing the default value should help avoid this kind + // of confusing race condition. ? Duration.ofMillis( - JobManagerOptions - .FACTOR_FOR_DEFAULT_MAXIMUM_DELAY_FOR_RESCALE_TRIGGER + (rescaleOnFailedCheckpointsCount + 1) * checkpointingConfiguration .getCheckpointCoordinatorConfiguration() .getCheckpointInterval()) @@ -247,7 +276,8 @@ public static Settings of( scalingIntervalMax, configuration.get(MIN_PARALLELISM_INCREASE), configuration.get( - MAXIMUM_DELAY_FOR_SCALE_TRIGGER, maximumDelayForRescaleTriggerDefault)); + MAXIMUM_DELAY_FOR_SCALE_TRIGGER, maximumDelayForRescaleTriggerDefault), + rescaleOnFailedCheckpointsCount); } private final SchedulerExecutionMode executionMode; @@ -257,6 +287,7 @@ public static Settings of( private final Duration scalingIntervalMin; private final Duration scalingIntervalMax; private final Duration maximumDelayForTriggeringRescale; + private final int rescaleOnFailedCheckpointCount; private final int minParallelismChangeForDesiredRescale; private Settings( @@ -267,7 +298,8 @@ private Settings( Duration scalingIntervalMin, Duration scalingIntervalMax, int
minParallelismChangeForDesiredRescale, - Duration maximumDelayForTriggeringRescale) { + Duration maximumDelayForTriggeringRescale, + int rescaleOnFailedCheckpointCount) { this.executionMode = executionMode; this.initialResourceAllocationTimeout = initialResourceAllocationTimeout; this.resourceStabilizationTimeout = resourceStabilizationTimeout; @@ -276,6 +308,7 @@ private Settings( this.scalingIntervalMax = scalingIntervalMax; this.minParallelismChangeForDesiredRescale = minParallelismChangeForDesiredRescale; this.maximumDelayForTriggeringRescale = maximumDelayForTriggeringRescale; + this.rescaleOnFailedCheckpointCount = rescaleOnFailedCheckpointCount; } public SchedulerExecutionMode getExecutionMode() { @@ -309,6 +342,10 @@ public int getMinParallelismChangeForDesiredRescale() { public Duration getMaximumDelayForTriggeringRescale() { return maximumDelayForTriggeringRescale; } + + public int getRescaleOnFailedCheckpointCount() { + return rescaleOnFailedCheckpointCount; + } } private final Settings settings; @@ -388,11 +425,62 @@ public AdaptiveScheduler( Collection failureEnrichers, ExecutionGraphFactory executionGraphFactory) throws JobExecutionException { + this( + settings, + DefaultRescaleManager.Factory.fromSettings(settings), + (metricGroup, checkpointStatsListener) -> + new DefaultCheckpointStatsTracker( + configuration.get(WebOptions.CHECKPOINTS_HISTORY_SIZE), + metricGroup, + checkpointStatsListener), + jobGraph, + jobResourceRequirements, + configuration, + declarativeSlotPool, + slotAllocator, + ioExecutor, + userCodeClassLoader, + checkpointsCleaner, + checkpointRecoveryFactory, + jobManagerJobMetricGroup, + restartBackoffTimeStrategy, + initializationTimestamp, + mainThreadExecutor, + fatalErrorHandler, + jobStatusListener, + failureEnrichers, + executionGraphFactory); + } + + @VisibleForTesting + AdaptiveScheduler( + Settings settings, + RescaleManager.Factory rescaleManagerFactory, + BiFunction + checkpointStatsTrackerFactory, + JobGraph jobGraph, + 
@Nullable JobResourceRequirements jobResourceRequirements, + Configuration configuration, + DeclarativeSlotPool declarativeSlotPool, + SlotAllocator slotAllocator, + Executor ioExecutor, + ClassLoader userCodeClassLoader, + CheckpointsCleaner checkpointsCleaner, + CheckpointRecoveryFactory checkpointRecoveryFactory, + JobManagerJobMetricGroup jobManagerJobMetricGroup, + RestartBackoffTimeStrategy restartBackoffTimeStrategy, + long initializationTimestamp, + ComponentMainThreadExecutor mainThreadExecutor, + FatalErrorHandler fatalErrorHandler, + JobStatusListener jobStatusListener, + Collection failureEnrichers, + ExecutionGraphFactory executionGraphFactory) + throws JobExecutionException { assertPreconditions(jobGraph); this.settings = settings; - this.rescaleManagerFactory = DefaultRescaleManager.Factory.fromSettings(settings); + this.rescaleManagerFactory = rescaleManagerFactory; this.jobGraph = jobGraph; this.jobInfo = new JobInfoImpl(jobGraph.getJobID(), jobGraph.getName()); @@ -426,9 +514,8 @@ public AdaptiveScheduler( SchedulerUtils.createCheckpointStatsTrackerIfCheckpointingIsEnabled( jobGraph, () -> - new DefaultCheckpointStatsTracker( - configuration.get(WebOptions.CHECKPOINTS_HISTORY_SIZE), - jobManagerJobMetricGroup)); + checkpointStatsTrackerFactory.apply( + jobManagerJobMetricGroup, createCheckpointStatsListener())); this.slotAllocator = slotAllocator; @@ -1089,7 +1176,8 @@ public void goToExecuting( userCodeClassLoader, failureCollection, rescaleManagerFactory, - settings.getMinParallelismChangeForDesiredRescale())); + settings.getMinParallelismChangeForDesiredRescale(), + settings.getRescaleOnFailedCheckpointCount())); } @Override @@ -1497,4 +1585,42 @@ private void checkIdleSlotTimeout() { settings.getSlotIdleTimeout().toMillis(), TimeUnit.MILLISECONDS); } + + /** + * Creates a {@link CheckpointStatsListener} whose checkpoint-related callbacks are forwarded + * to the current state in the {@code AdaptiveScheduler}'s main thread.
+ */ + private CheckpointStatsListener createCheckpointStatsListener() { + + return new CheckpointStatsListener() { + + @Override + public void onFailedCheckpoint() { + runIfSupported(CheckpointStatsListener::onFailedCheckpoint, "onFailedCheckpoint"); + } + + @Override + public void onCompletedCheckpoint() { + runIfSupported( + CheckpointStatsListener::onCompletedCheckpoint, "onCompletedCheckpoint"); + } + + private void runIfSupported( + ThrowingConsumer callback, + String callbackLabel) { + AdaptiveScheduler.this + .getMainThreadExecutor() + .execute( + () -> + state.tryRun( + CheckpointStatsListener.class, + callback, + logger -> + logger.debug( + "{} is not supported by {}.", + callbackLabel, + state.getClass().getName()))); + } + }; + } } diff --git a/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/adaptive/DefaultRescaleManager.java b/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/adaptive/DefaultRescaleManager.java index 69c11f1b8129bb..0b0fc013357e09 100644 --- a/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/adaptive/DefaultRescaleManager.java +++ b/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/adaptive/DefaultRescaleManager.java @@ -26,6 +26,7 @@ import org.slf4j.LoggerFactory; import javax.annotation.Nullable; +import javax.annotation.concurrent.NotThreadSafe; import java.time.Duration; import java.time.Instant; @@ -47,8 +48,12 @@ * available (its lower threshold is defined by (@code scalingIntervalMax}). * * + *

Thread-safety: This class is not implemented in a thread-safe manner and relies on the fact + * that any method call happens within a single thread. + * * @see Executing */ +@NotThreadSafe public class DefaultRescaleManager implements RescaleManager { private static final Logger LOG = LoggerFactory.getLogger(DefaultRescaleManager.class); @@ -116,37 +121,21 @@ public class DefaultRescaleManager implements RescaleManager { @Override public void onChange() { - runInContextMainThread( - () -> { - if (this.triggerFuture.isDone()) { - this.triggerFuture = - scheduleOperationWithTrigger(this::evaluateChangeEvent); - } - }); + if (this.triggerFuture.isDone()) { + this.triggerFuture = scheduleOperationWithTrigger(this::evaluateChangeEvent); + } } @Override public void onTrigger() { - runInContextMainThread( - () -> { - if (!this.triggerFuture.isDone()) { - this.triggerFuture.complete(null); - LOG.debug( - "A rescale trigger event was observed causing the rescale verification logic to be initiated."); - } else { - LOG.debug( - "A rescale trigger event was observed outside of a rescale cycle. No action taken."); - } - }); - } - - /** - * Runs the {@code callback} in the context's main thread by scheduling the operation with no - * delay. This method should be used for internal state changes that might be triggered from - * outside the context's main thread. - */ - private void runInContextMainThread(Runnable callback) { - rescaleContext.scheduleOperation(callback, Duration.ZERO); + if (!this.triggerFuture.isDone()) { + this.triggerFuture.complete(null); + LOG.debug( + "A rescale trigger event was observed causing the rescale verification logic to be initiated."); + } else { + LOG.debug( + "A rescale trigger event was observed outside of a rescale cycle. 
No action taken."); + } } private void evaluateChangeEvent() { diff --git a/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/adaptive/Executing.java b/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/adaptive/Executing.java index 80ee29af6ad92b..1fcd23884f5a1c 100644 --- a/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/adaptive/Executing.java +++ b/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/adaptive/Executing.java @@ -22,6 +22,7 @@ import org.apache.flink.core.execution.SavepointFormatType; import org.apache.flink.runtime.JobException; import org.apache.flink.runtime.checkpoint.CheckpointScheduling; +import org.apache.flink.runtime.checkpoint.CheckpointStatsListener; import org.apache.flink.runtime.checkpoint.CompletedCheckpoint; import org.apache.flink.runtime.execution.ExecutionState; import org.apache.flink.runtime.executiongraph.AccessExecutionGraph; @@ -52,17 +53,21 @@ import java.util.Optional; import java.util.concurrent.CompletableFuture; import java.util.concurrent.ScheduledFuture; +import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Collectors; /** State which represents a running job with an {@link ExecutionGraph} and assigned slots. 
*/ class Executing extends StateWithExecutionGraph - implements ResourceListener, RescaleManager.Context { + implements ResourceListener, RescaleManager.Context, CheckpointStatsListener { private final Context context; private final RescalingController sufficientResourcesController; private final RescalingController desiredResourcesController; private final RescaleManager rescaleManager; + private final int rescaleOnFailedCheckpointCount; + // null indicates that there was no change event observed, yet + @Nullable private AtomicInteger failedCheckpointCountdown; Executing( ExecutionGraph executionGraph, @@ -74,6 +79,7 @@ class Executing extends StateWithExecutionGraph List failureCollection, RescaleManager.Factory rescaleManagerFactory, int minParallelismChangeForRescale, + int rescaleOnFailedCheckpointCount, Instant lastRescale) { super( context, @@ -92,10 +98,22 @@ class Executing extends StateWithExecutionGraph new EnforceMinimalIncreaseRescalingController(minParallelismChangeForRescale); this.rescaleManager = rescaleManagerFactory.create(this, lastRescale); + Preconditions.checkArgument( + rescaleOnFailedCheckpointCount > 0, + "The rescaleOnFailedCheckpointCount should be larger than 0."); + this.rescaleOnFailedCheckpointCount = rescaleOnFailedCheckpointCount; + this.failedCheckpointCountdown = null; + deploy(); // check if new resources have come available in the meantime - context.runIfState(this, this::evaluateRescaling, Duration.ZERO); + context.runIfState( + this, + () -> { + rescaleManager.onChange(); + rescaleManager.onTrigger(); + }, + Duration.ZERO); } @Override @@ -194,17 +212,38 @@ private void handleDeploymentFailure(ExecutionVertex executionVertex, JobExcepti @Override public void onNewResourcesAvailable() { - evaluateRescaling(); + rescaleManager.onChange(); + initializeFailedCheckpointCountdownIfUnset(); } @Override public void onNewResourceRequirements() { - evaluateRescaling(); + rescaleManager.onChange(); + 
initializeFailedCheckpointCountdownIfUnset(); } - private void evaluateRescaling() { - rescaleManager.onChange(); + @Override + public void onCompletedCheckpoint() { + triggerPotentialRescale(); + } + + @Override + public void onFailedCheckpoint() { + if (this.failedCheckpointCountdown != null + && this.failedCheckpointCountdown.decrementAndGet() <= 0) { + triggerPotentialRescale(); + } + } + + private void triggerPotentialRescale() { rescaleManager.onTrigger(); + this.failedCheckpointCountdown = null; + } + + private void initializeFailedCheckpointCountdownIfUnset() { + if (failedCheckpointCountdown == null) { + this.failedCheckpointCountdown = new AtomicInteger(this.rescaleOnFailedCheckpointCount); + } } CompletableFuture stopWithSavepoint( @@ -285,6 +324,7 @@ static class Factory implements StateFactory { private final List failureCollection; private final RescaleManager.Factory rescaleManagerFactory; private final int minParallelismChangeForRescale; + private final int rescaleOnFailedCheckpointCount; Factory( ExecutionGraph executionGraph, @@ -295,7 +335,8 @@ static class Factory implements StateFactory { ClassLoader userCodeClassLoader, List failureCollection, RescaleManager.Factory rescaleManagerFactory, - int minParallelismChangeForRescale) { + int minParallelismChangeForRescale, + int rescaleOnFailedCheckpointCount) { this.context = context; this.log = log; this.executionGraph = executionGraph; @@ -305,6 +346,7 @@ static class Factory implements StateFactory { this.failureCollection = failureCollection; this.rescaleManagerFactory = rescaleManagerFactory; this.minParallelismChangeForRescale = minParallelismChangeForRescale; + this.rescaleOnFailedCheckpointCount = rescaleOnFailedCheckpointCount; } public Class getStateClass() { @@ -322,6 +364,7 @@ public Executing getState() { failureCollection, rescaleManagerFactory, minParallelismChangeForRescale, + rescaleOnFailedCheckpointCount, Instant.now()); } } diff --git 
a/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/adaptive/State.java b/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/adaptive/State.java index 4a815642d1c5cf..2a4bb9660421b0 100644 --- a/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/adaptive/State.java +++ b/flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/adaptive/State.java @@ -26,6 +26,7 @@ import org.slf4j.Logger; import java.util.Optional; +import java.util.function.Consumer; /** * State abstraction of the {@link AdaptiveScheduler}. This interface contains all methods every @@ -100,17 +101,38 @@ default Optional as(Class clazz) { */ default void tryRun( Class clazz, ThrowingConsumer action, String debugMessage) throws E { + tryRun( + clazz, + action, + logger -> + logger.debug( + "Cannot run '{}' because the actual state is {} and not {}.", + debugMessage, + this.getClass().getSimpleName(), + clazz.getSimpleName())); + } + + /** + * Tries to run the action if this state is of type clazz. + * + * @param clazz clazz describes the target type + * @param action action to run if this state is of the target type + * @param invalidStateCallback that is called if the state isn't matching the expected one. 
+ * @param target type + * @param error type + * @throws E an exception if the action fails + */ + default void tryRun( + Class clazz, + ThrowingConsumer action, + Consumer invalidStateCallback) + throws E { final Optional asOptional = as(clazz); if (asOptional.isPresent()) { action.accept(asOptional.get()); } else { - getLogger() - .debug( - "Cannot run '{}' because the actual state is {} and not {}.", - debugMessage, - this.getClass().getSimpleName(), - clazz.getSimpleName()); + invalidStateCallback.accept(getLogger()); } } diff --git a/flink-runtime/src/test/java/org/apache/flink/runtime/checkpoint/DefaultCheckpointStatsTrackerTest.java b/flink-runtime/src/test/java/org/apache/flink/runtime/checkpoint/DefaultCheckpointStatsTrackerTest.java index 42555be1701239..ad1571925db777 100644 --- a/flink-runtime/src/test/java/org/apache/flink/runtime/checkpoint/DefaultCheckpointStatsTrackerTest.java +++ b/flink-runtime/src/test/java/org/apache/flink/runtime/checkpoint/DefaultCheckpointStatsTrackerTest.java @@ -46,6 +46,8 @@ import java.util.List; import java.util.Map; import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.BiConsumer; import java.util.stream.Collectors; import static java.util.Collections.singletonMap; @@ -239,6 +241,71 @@ void testCheckpointTracking() throws Exception { assertThat(snapshot.getLatestRestoredCheckpoint()).isEqualTo(restored); } + @Test + void testCheckpointStatsListenerOnCompletedCheckpoint() { + testCheckpointStatsListener( + (checkpointStatsTracker, pendingCheckpointStats) -> + checkpointStatsTracker.reportCompletedCheckpoint( + pendingCheckpointStats.toCompletedCheckpointStats( + "random-external-pointer")), + 1, + 0); + } + + @Test + void testCheckpointStatsListenerOnFailedCheckpoint() { + testCheckpointStatsListener( + (checkpointStatsTracker, pendingCheckpointStats) -> + checkpointStatsTracker.reportFailedCheckpoint( + 
pendingCheckpointStats.toFailedCheckpoint( + System.currentTimeMillis(), null)), + 0, + 1); + } + + private void testCheckpointStatsListener( + BiConsumer testCodeCallback, + int expectedOnCompletedCheckpointCount, + int expectedOnFailedCheckpointCount) { + final AtomicInteger onCompletedCheckpointCount = new AtomicInteger(); + final AtomicInteger onFailedCheckpointCount = new AtomicInteger(); + final CheckpointStatsListener listener = + new CheckpointStatsListener() { + @Override + public void onCompletedCheckpoint() { + onCompletedCheckpointCount.incrementAndGet(); + } + + @Override + public void onFailedCheckpoint() { + onFailedCheckpointCount.incrementAndGet(); + } + }; + + final CheckpointStatsTracker statsTracker = + new DefaultCheckpointStatsTracker( + 10, + UnregisteredMetricGroups.createUnregisteredJobManagerJobMetricGroup(), + listener); + + // "factory" code to enable the instantiation of test data based on a PendingCheckpointStats + // instance + final JobVertexID jobVertexID = new JobVertexID(); + final PendingCheckpointStats pending = + statsTracker.reportPendingCheckpoint( + 0, + 1, + CheckpointProperties.forCheckpoint( + CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION), + singletonMap(jobVertexID, 1)); + pending.reportSubtaskStats(jobVertexID, createSubtaskStats(0)); + + testCodeCallback.accept(statsTracker, pending); + + assertThat(onCompletedCheckpointCount).hasValue(expectedOnCompletedCheckpointCount); + assertThat(onFailedCheckpointCount).hasValue(expectedOnFailedCheckpointCount); + } + /** Tests that snapshots are only created if a new snapshot has been reported or updated. 
*/ @Test void testCreateSnapshot() { diff --git a/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/adaptive/AdaptiveSchedulerBuilder.java b/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/adaptive/AdaptiveSchedulerBuilder.java index fca3c7a854833d..42f50616490b2b 100644 --- a/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/adaptive/AdaptiveSchedulerBuilder.java +++ b/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/adaptive/AdaptiveSchedulerBuilder.java @@ -23,7 +23,10 @@ import org.apache.flink.runtime.blob.BlobWriter; import org.apache.flink.runtime.blob.VoidBlobWriter; import org.apache.flink.runtime.checkpoint.CheckpointRecoveryFactory; +import org.apache.flink.runtime.checkpoint.CheckpointStatsListener; +import org.apache.flink.runtime.checkpoint.CheckpointStatsTracker; import org.apache.flink.runtime.checkpoint.CheckpointsCleaner; +import org.apache.flink.runtime.checkpoint.DefaultCheckpointStatsTracker; import org.apache.flink.runtime.checkpoint.StandaloneCheckpointRecoveryFactory; import org.apache.flink.runtime.concurrent.ComponentMainThreadExecutor; import org.apache.flink.runtime.executiongraph.JobStatusListener; @@ -52,6 +55,7 @@ import java.util.Collection; import java.util.Collections; import java.util.concurrent.ScheduledExecutorService; +import java.util.function.BiFunction; import java.util.function.Function; /** Builder for {@link AdaptiveScheduler}. */ @@ -89,6 +93,17 @@ public class AdaptiveSchedulerBuilder { @Nullable private SlotAllocator slotAllocator; + /** + * {@code null} indicates that the default factory will be used based on the set configuration. 
+ */ + @Nullable private RescaleManager.Factory rescaleManagerFactory = null; + + private BiFunction + checkpointStatsTrackerFactory = + (metricGroup, checkpointStatsListener) -> + new DefaultCheckpointStatsTracker( + 10, metricGroup, checkpointStatsListener); + public AdaptiveSchedulerBuilder( final JobGraph jobGraph, ComponentMainThreadExecutor mainThreadExecutor, @@ -206,6 +221,23 @@ public AdaptiveSchedulerBuilder setSlotAllocator(SlotAllocator slotAllocator) { return this; } + public AdaptiveSchedulerBuilder setRescaleManagerFactory( + @Nullable RescaleManager.Factory rescaleManagerFactory) { + this.rescaleManagerFactory = rescaleManagerFactory; + return this; + } + + public AdaptiveSchedulerBuilder setCheckpointStatsTrackerFactory( + @Nullable + BiFunction< + JobManagerJobMetricGroup, + CheckpointStatsListener, + CheckpointStatsTracker> + checkpointStatsTrackerFactory) { + this.checkpointStatsTrackerFactory = checkpointStatsTrackerFactory; + return this; + } + public AdaptiveScheduler build() throws Exception { final ExecutionGraphFactory executionGraphFactory = new DefaultExecutionGraphFactory( @@ -220,8 +252,14 @@ public AdaptiveScheduler build() throws Exception { shuffleMaster, partitionTracker); + final AdaptiveScheduler.Settings settings = + AdaptiveScheduler.Settings.of(jobMasterConfiguration); return new AdaptiveScheduler( - AdaptiveScheduler.Settings.of(jobMasterConfiguration), + settings, + rescaleManagerFactory == null + ? 
DefaultRescaleManager.Factory.fromSettings(settings) + : rescaleManagerFactory, + checkpointStatsTrackerFactory, jobGraph, jobResourceRequirements, jobMasterConfiguration, diff --git a/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/adaptive/AdaptiveSchedulerTest.java b/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/adaptive/AdaptiveSchedulerTest.java index 6fb3d0e017f85c..d1bde31da6c2f9 100644 --- a/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/adaptive/AdaptiveSchedulerTest.java +++ b/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/adaptive/AdaptiveSchedulerTest.java @@ -33,8 +33,10 @@ import org.apache.flink.runtime.checkpoint.CheckpointException; import org.apache.flink.runtime.checkpoint.CheckpointFailureReason; import org.apache.flink.runtime.checkpoint.CheckpointIDCounter; +import org.apache.flink.runtime.checkpoint.CheckpointStatsListener; import org.apache.flink.runtime.checkpoint.CheckpointsCleaner; import org.apache.flink.runtime.checkpoint.CompletedCheckpointStore; +import org.apache.flink.runtime.checkpoint.NoOpCheckpointStatsTracker; import org.apache.flink.runtime.checkpoint.StandaloneCheckpointIDCounter; import org.apache.flink.runtime.checkpoint.StandaloneCompletedCheckpointStore; import org.apache.flink.runtime.checkpoint.TestingCheckpointIDCounter; @@ -64,6 +66,7 @@ import org.apache.flink.runtime.io.network.partition.ResultPartitionID; import org.apache.flink.runtime.jobgraph.IntermediateDataSetID; import org.apache.flink.runtime.jobgraph.JobGraph; +import org.apache.flink.runtime.jobgraph.JobGraphBuilder; import org.apache.flink.runtime.jobgraph.JobResourceRequirements; import org.apache.flink.runtime.jobgraph.JobVertex; import org.apache.flink.runtime.jobgraph.JobVertexResourceRequirements; @@ -75,6 +78,8 @@ import org.apache.flink.runtime.jobmaster.slotpool.DeclarativeSlotPool; import org.apache.flink.runtime.jobmaster.slotpool.DefaultAllocatedSlotPool; import 
org.apache.flink.runtime.jobmaster.slotpool.DefaultDeclarativeSlotPool; +import org.apache.flink.runtime.jobmaster.slotpool.TestingDeclarativeSlotPoolBuilder; +import org.apache.flink.runtime.jobmaster.slotpool.TestingFreeSlotInfoTracker; import org.apache.flink.runtime.messages.Acknowledge; import org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint; import org.apache.flink.runtime.metrics.MetricNames; @@ -89,6 +94,8 @@ import org.apache.flink.runtime.scheduler.DefaultSchedulerTest; import org.apache.flink.runtime.scheduler.SchedulerBase; import org.apache.flink.runtime.scheduler.SchedulerNG; +import org.apache.flink.runtime.scheduler.SchedulerTestingUtils; +import org.apache.flink.runtime.scheduler.TestingPhysicalSlot; import org.apache.flink.runtime.scheduler.VertexParallelismInformation; import org.apache.flink.runtime.scheduler.VertexParallelismStore; import org.apache.flink.runtime.scheduler.adaptive.allocator.TestSlotInfo; @@ -106,6 +113,7 @@ import org.apache.flink.testutils.executor.TestExecutorExtension; import org.apache.flink.traces.Span; import org.apache.flink.traces.SpanBuilder; +import org.apache.flink.util.ConfigurationException; import org.apache.flink.util.FlinkException; import org.apache.flink.util.IterableUtils; import org.apache.flink.util.Preconditions; @@ -141,6 +149,7 @@ import java.util.function.BiFunction; import java.util.function.Consumer; import java.util.stream.Collectors; +import java.util.stream.IntStream; import static org.apache.flink.core.testutils.FlinkAssertions.assertThatFuture; import static org.apache.flink.runtime.executiongraph.ExecutionGraphTestUtils.createExecutionAttemptId; @@ -150,6 +159,7 @@ import static org.apache.flink.runtime.jobmaster.slotpool.SlotPoolTestUtils.offerSlots; import static org.apache.flink.runtime.scheduler.SchedulerTestingUtils.enableCheckpointing; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatNoException; import static 
org.assertj.core.api.Assertions.assertThatThrownBy; /** Tests for the {@link AdaptiveScheduler}. */ @@ -1280,7 +1290,8 @@ private static Configuration createConfigurationWithNoTimeouts() { return new Configuration() .set(JobManagerOptions.RESOURCE_WAIT_TIMEOUT, Duration.ofMillis(-1L)) .set(JobManagerOptions.RESOURCE_STABILIZATION_TIMEOUT, Duration.ofMillis(1L)) - .set(JobManagerOptions.SCHEDULER_SCALING_INTERVAL_MIN, Duration.ofMillis(1L)); + .set(JobManagerOptions.SCHEDULER_SCALING_INTERVAL_MIN, Duration.ofMillis(1L)) + .set(JobManagerOptions.MAXIMUM_DELAY_FOR_SCALE_TRIGGER, Duration.ZERO); } private AdaptiveSchedulerBuilder prepareSchedulerWithNoTimeouts( @@ -2112,7 +2123,7 @@ void testRequestUpdatedResourceRequirements() throws Exception { } @Test - public void testScalingIntervalConfigurationIsRespected() { + void testScalingIntervalConfigurationIsRespected() throws ConfigurationException { final Duration scalingIntervalMin = Duration.ofMillis(1337); final Duration scalingIntervalMax = Duration.ofMillis(7331); final Configuration configuration = createConfigurationWithNoTimeouts(); @@ -2124,10 +2135,151 @@ public void testScalingIntervalConfigurationIsRespected() { assertThat(settings.getScalingIntervalMax()).isEqualTo(scalingIntervalMax); } + @Test + void testOnCompletedCheckpointIsHandledInMainThread() throws Exception { + testCheckpointStatsEventBeingExecutedInTheMainThread( + CheckpointStatsListener::onCompletedCheckpoint, 1, Integer.MAX_VALUE); + } + + @Test + void testOnFailedCheckpointIsHandledInMainThread() throws Exception { + testCheckpointStatsEventBeingExecutedInTheMainThread( + CheckpointStatsListener::onFailedCheckpoint, 2, 2); + } + + private void testCheckpointStatsEventBeingExecutedInTheMainThread( + Consumer eventCallback, + int eventRepetitions, + int triggerOnFailedCheckpointCount) + throws Exception { + + final CompletableFuture statsListenerInstantiatedFuture = + new CompletableFuture<>(); + final BlockingQueue eventQueue = new 
ArrayBlockingQueue<>(1); + + final AdaptiveScheduler testInstance = + createSchedulerThatReachesExecutingState( + PARALLELISM, + triggerOnFailedCheckpointCount, + eventQueue, + statsListenerInstantiatedFuture); + + try { + // start scheduling to reach Executing state + singleThreadMainThreadExecutor.execute(testInstance::startScheduling); + + final CheckpointStatsListener statsListener = statsListenerInstantiatedFuture.get(); + assertThat(statsListener) + .as("The CheckpointStatsListener should have been instantiated.") + .isNotNull(); + + // the first trigger happens in the Executing initialization - let's wait for that event + // to pass + assertThat(eventQueue.take()) + .as( + "The first event should have been appeared during Executing state initialization and should be ignored.") + .isEqualTo(0); + + // counting the failed checkpoints only starts on a change event + testInstance.updateJobResourceRequirements( + JobResourceRequirements.newBuilder() + .setParallelismForJobVertex(JOB_VERTEX.getID(), 1, PARALLELISM - 1) + .build()); + + for (int i = 0; i < eventRepetitions; i++) { + assertThatNoException() + .as( + "Triggering the event from outside the main thread should not have caused an error.") + .isThrownBy(() -> eventCallback.accept(statsListener)); + } + + assertThat(eventQueue.take()) + .as("Only one event should have been observed.") + .isEqualTo(1); + } finally { + final CompletableFuture closeFuture = new CompletableFuture<>(); + singleThreadMainThreadExecutor.execute( + () -> FutureUtils.forward(testInstance.closeAsync(), closeFuture)); + assertThatFuture(closeFuture).eventuallySucceeds(); + } + } + // --------------------------------------------------------------------------------------------- // Utils // --------------------------------------------------------------------------------------------- + private AdaptiveScheduler createSchedulerThatReachesExecutingState( + int parallelism, + int onFailedCheckpointCount, + BlockingQueue eventQueue, + 
CompletableFuture statsListenerInstantiatedFuture) + throws Exception { + final Configuration config = new Configuration(); + config.set( + JobManagerOptions.SCHEDULER_SCALE_ON_FAILED_CHECKPOINTS_COUNT, + onFailedCheckpointCount); + + final JobGraph jobGraph = + JobGraphBuilder.newStreamingJobGraphBuilder() + .addJobVertices(Collections.singletonList(JOB_VERTEX)) + .setJobCheckpointingSettings( + new JobCheckpointingSettings( + new CheckpointCoordinatorConfiguration + .CheckpointCoordinatorConfigurationBuilder() + .build(), + null)) + .build(); + SchedulerTestingUtils.enableCheckpointing(jobGraph); + + // testing SlotPool instance that would allow for the scheduler to transition to Executing + // state + final DeclarativeSlotPool slotPool = + new TestingDeclarativeSlotPoolBuilder() + .setContainsFreeSlotFunction(allocationID -> true) + .setReserveFreeSlotFunction( + (allocationId, resourceProfile) -> + TestingPhysicalSlot.builder() + .withAllocationID(allocationId) + .build()) + .setGetFreeSlotInfoTrackerSupplier( + () -> + TestingFreeSlotInfoTracker.newBuilder() + .setGetFreeSlotsInformationSupplier( + () -> + IntStream.range(0, parallelism) + .mapToObj( + v -> + new TestSlotInfo()) + .collect( + Collectors.toSet())) + .build()) + .build(); + + final AtomicInteger eventCounter = new AtomicInteger(); + return new AdaptiveSchedulerBuilder( + jobGraph, singleThreadMainThreadExecutor, EXECUTOR_RESOURCE.getExecutor()) + .setJobMasterConfiguration(config) + .setDeclarativeSlotPool(slotPool) + .setRescaleManagerFactory( + new TestingRescaleManager.Factory( + () -> {}, + () -> { + singleThreadMainThreadExecutor.assertRunningInMainThread(); + + eventQueue.offer(eventCounter.getAndIncrement()); + })) + .setCheckpointStatsTrackerFactory( + (metricGroup, listener) -> { + assertThat(statsListenerInstantiatedFuture) + .as( + "The CheckpointStatsListener should be only instantiated once.") + .isNotCompleted(); + statsListenerInstantiatedFuture.complete(listener); + return 
NoOpCheckpointStatsTracker.INSTANCE; + }) + .build(); + } + private CompletableFuture getArchivedExecutionGraphForRunningJob( SchedulerNG scheduler) { return CompletableFuture.supplyAsync( diff --git a/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/adaptive/DefaultRescaleManagerTest.java b/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/adaptive/DefaultRescaleManagerTest.java index c24f5ed8c80b67..94e6a57ea093ae 100644 --- a/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/adaptive/DefaultRescaleManagerTest.java +++ b/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/adaptive/DefaultRescaleManagerTest.java @@ -20,6 +20,7 @@ import org.apache.flink.configuration.Configuration; import org.apache.flink.configuration.JobManagerOptions; +import org.apache.flink.util.ConfigurationException; import org.junit.jupiter.api.Test; @@ -39,7 +40,7 @@ class DefaultRescaleManagerTest { @Test - void testProperConfiguration() { + void testProperConfiguration() throws ConfigurationException { final Duration scalingIntervalMin = Duration.ofMillis(1337); final Duration scalingIntervalMax = Duration.ofMillis(7331); final Duration maximumDelayForRescaleTrigger = Duration.ofMillis(4242); diff --git a/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/adaptive/ExecutingTest.java b/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/adaptive/ExecutingTest.java index e3a9870f9328d3..48ee70bfd297f7 100644 --- a/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/adaptive/ExecutingTest.java +++ b/flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/adaptive/ExecutingTest.java @@ -97,9 +97,12 @@ import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.ScheduledFuture; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; import java.util.function.Consumer; import 
java.util.function.Function; import java.util.function.Supplier; +import java.util.stream.IntStream; import static org.apache.flink.runtime.scheduler.adaptive.WaitingForResourcesTest.assertNonNull; import static org.assertj.core.api.Assertions.assertThat; @@ -154,6 +157,7 @@ void testNoDeploymentCallOnEnterWhenVertexRunning() throws Exception { new ArrayList<>(), TestingRescaleManager.Factory.noOpFactory(), 1, + 1, Instant.now()); assertThat(mockExecutionVertex.isDeployCalled()).isFalse(); } @@ -181,12 +185,124 @@ void testIllegalStateExceptionOnNotRunningExecutionGraph() { new ArrayList<>(), TestingRescaleManager.Factory.noOpFactory(), 1, + 1, Instant.now()); } }) .isInstanceOf(IllegalStateException.class); } + @Test + public void testTriggerRescaleOnCompletedCheckpoint() throws Exception { + final AtomicBoolean rescaleTriggered = new AtomicBoolean(); + final RescaleManager.Factory rescaleManagerFactory = + new TestingRescaleManager.Factory(() -> {}, () -> rescaleTriggered.set(true)); + try (MockExecutingContext ctx = new MockExecutingContext()) { + final Executing testInstance = + new ExecutingStateBuilder() + .setRescaleManagerFactory(rescaleManagerFactory) + .build(ctx); + + assertThat(rescaleTriggered).isFalse(); + testInstance.onCompletedCheckpoint(); + assertThat(rescaleTriggered).isTrue(); + } + } + + @Test + public void testTriggerRescaleOnFailedCheckpoint() throws Exception { + final AtomicInteger rescaleTriggerCount = new AtomicInteger(); + final RescaleManager.Factory rescaleManagerFactory = + new TestingRescaleManager.Factory(() -> {}, rescaleTriggerCount::incrementAndGet); + final int rescaleOnFailedCheckpointsCount = 3; + try (MockExecutingContext ctx = new MockExecutingContext()) { + final Executing testInstance = + new ExecutingStateBuilder() + .setRescaleManagerFactory(rescaleManagerFactory) + .setRescaleOnFailedCheckpointCount(rescaleOnFailedCheckpointsCount) + .build(ctx); + + // do multiple rescale iterations to verify that subsequent failed 
checkpoints after a + // rescale result in the expected behavior + for (int rescaleIteration = 1; rescaleIteration <= 3; rescaleIteration++) { + + // trigger an initial failed checkpoint event to show that the counting only starts + // with the subsequent change event + testInstance.onFailedCheckpoint(); + + // trigger change + testInstance.onNewResourceRequirements(); + + for (int i = 0; i < rescaleOnFailedCheckpointsCount; i++) { + assertThat(rescaleTriggerCount) + .as( + "No rescale operation should have been triggered for iteration #%d, yet.", + rescaleIteration) + .hasValue(rescaleIteration - 1); + testInstance.onFailedCheckpoint(); + } + + assertThat(rescaleTriggerCount) + .as( + "The rescale operation for iteration #%d should have been properly triggered.", + rescaleIteration) + .hasValue(rescaleIteration); + } + } + } + + @Test + public void testOnCompletedCheckpointResetsFailedCheckpointCount() throws Exception { + final AtomicInteger rescaleTriggeredCount = new AtomicInteger(); + final RescaleManager.Factory rescaleManagerFactory = + new TestingRescaleManager.Factory(() -> {}, rescaleTriggeredCount::incrementAndGet); + final int rescaleOnFailedCheckpointsCount = 3; + try (MockExecutingContext ctx = new MockExecutingContext()) { + final Executing testInstance = + new ExecutingStateBuilder() + .setRescaleManagerFactory(rescaleManagerFactory) + .setRescaleOnFailedCheckpointCount(rescaleOnFailedCheckpointsCount) + .build(ctx); + + // trigger an initial failed checkpoint event to show that the counting only starts with + // the subsequent change event + testInstance.onFailedCheckpoint(); + + // trigger change + testInstance.onNewResourcesAvailable(); + + IntStream.range(0, rescaleOnFailedCheckpointsCount - 1) + .forEach(ignored -> testInstance.onFailedCheckpoint()); + + assertThat(rescaleTriggeredCount) + .as("No rescaling should have been trigger, yet.") + .hasValue(0); + + testInstance.onCompletedCheckpoint(); + + // trigger change + 
testInstance.onNewResourceRequirements(); + + assertThat(rescaleTriggeredCount) + .as("The completed checkpoint should have triggered a rescale.") + .hasValue(1); + + IntStream.range(0, rescaleOnFailedCheckpointsCount - 1) + .forEach(ignored -> testInstance.onFailedCheckpoint()); + + assertThat(rescaleTriggeredCount) + .as( + "No additional rescaling should have been trigger by any subsequent failed checkpoint, yet.") + .hasValue(1); + + testInstance.onFailedCheckpoint(); + + assertThat(rescaleTriggeredCount) + .as("The previous failed checkpoint should have triggered the rescale.") + .hasValue(2); + } + } + @Test void testDisposalOfOperatorCoordinatorsOnLeaveOfStateWithExecutionGraph() throws Exception { try (MockExecutingContext ctx = new MockExecutingContext()) { @@ -490,8 +606,9 @@ private final class ExecutingStateBuilder { TestingDefaultExecutionGraphBuilder.newBuilder() .build(EXECUTOR_EXTENSION.getExecutor()); private OperatorCoordinatorHandler operatorCoordinatorHandler; - private TestingRescaleManager.Factory rescaleManagerFactory = + private RescaleManager.Factory rescaleManagerFactory = TestingRescaleManager.Factory.noOpFactory(); + private int rescaleOnFailedCheckpointCount = 1; private ExecutingStateBuilder() throws JobException, JobExecutionException { operatorCoordinatorHandler = new TestingOperatorCoordinatorHandler(); @@ -509,11 +626,17 @@ public ExecutingStateBuilder setOperatorCoordinatorHandler( } public ExecutingStateBuilder setRescaleManagerFactory( - TestingRescaleManager.Factory rescaleManagerFactory) { + RescaleManager.Factory rescaleManagerFactory) { this.rescaleManagerFactory = rescaleManagerFactory; return this; } + public ExecutingStateBuilder setRescaleOnFailedCheckpointCount( + int rescaleOnFailedCheckpointCount) { + this.rescaleOnFailedCheckpointCount = rescaleOnFailedCheckpointCount; + return this; + } + private Executing build(MockExecutingContext ctx) { executionGraph.transitionToRunning(); @@ -528,6 +651,7 @@ private Executing 
build(MockExecutingContext ctx) { new ArrayList<>(), rescaleManagerFactory, 1, + rescaleOnFailedCheckpointCount, // will be ignored by the TestingRescaleManager.Factory Instant.now()); } finally { diff --git a/flink-tests/src/test/java/org/apache/flink/test/scheduling/RescaleOnCheckpointITCase.java b/flink-tests/src/test/java/org/apache/flink/test/scheduling/RescaleOnCheckpointITCase.java new file mode 100644 index 00000000000000..60c2b19cca1056 --- /dev/null +++ b/flink-tests/src/test/java/org/apache/flink/test/scheduling/RescaleOnCheckpointITCase.java @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.test.scheduling; + +import org.apache.flink.api.common.JobID; +import org.apache.flink.client.program.rest.RestClusterClient; +import org.apache.flink.configuration.CheckpointingOptions; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.configuration.JobManagerOptions; +import org.apache.flink.configuration.WebOptions; +import org.apache.flink.runtime.jobgraph.JobGraph; +import org.apache.flink.runtime.jobgraph.JobResourceRequirements; +import org.apache.flink.runtime.jobgraph.JobVertex; +import org.apache.flink.runtime.minicluster.MiniCluster; +import org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.api.functions.sink.v2.DiscardingSink; +import org.apache.flink.test.junit5.InjectClusterClient; +import org.apache.flink.test.junit5.InjectMiniCluster; +import org.apache.flink.test.junit5.MiniClusterExtension; +import org.apache.flink.testutils.TestingUtils; +import org.apache.flink.util.TestLoggerExtension; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.extension.RegisterExtension; + +import java.time.Duration; +import java.util.Iterator; + +import static org.apache.flink.test.scheduling.UpdateJobResourceRequirementsITCase.waitForAvailableSlots; +import static org.apache.flink.test.scheduling.UpdateJobResourceRequirementsITCase.waitForRunningTasks; +import static org.assertj.core.api.Assertions.assertThat; + +@ExtendWith(TestLoggerExtension.class) +class RescaleOnCheckpointITCase { + + // Scaling down is used here because scaling up is not supported by the NumberSequenceSource + // that's used in this test. 
+ private static final int NUMBER_OF_SLOTS = 4; + private static final int BEFORE_RESCALE_PARALLELISM = NUMBER_OF_SLOTS; + private static final int AFTER_RESCALE_PARALLELISM = NUMBER_OF_SLOTS - 1; + + // This timeout is used to wait for any possible rescale after the JobRequirement + // update (which shouldn't happen). A longer gap makes the test more reliable (it's hard to test + // that something didn't happen) but also increases the runtime of the test. + private static final Duration REQUIREMENT_UPDATE_TO_CHECKPOINT_GAP = Duration.ofSeconds(2); + + @RegisterExtension + private static final MiniClusterExtension MINI_CLUSTER_EXTENSION = + new MiniClusterExtension( + new MiniClusterResourceConfiguration.Builder() + .setConfiguration(createConfiguration()) + .setNumberSlotsPerTaskManager(NUMBER_OF_SLOTS) + .build()); + + private static Configuration createConfiguration() { + final Configuration configuration = new Configuration(); + configuration.set(JobManagerOptions.SCHEDULER, JobManagerOptions.SchedulerType.Adaptive); + + // speed the test suite up + // - lower refresh interval -> controls how fast we invalidate ExecutionGraphCache + // - lower slot idle timeout -> controls how fast we return idle slots to TM + configuration.set(WebOptions.REFRESH_INTERVAL, Duration.ofMillis(50L)); + configuration.set(JobManagerOptions.SLOT_IDLE_TIMEOUT, Duration.ofMillis(50L)); + + // no checkpoints shall be triggered by Flink itself + configuration.set( + CheckpointingOptions.CHECKPOINTING_INTERVAL, TestingUtils.infiniteDuration()); + + // rescale shouldn't be triggered due to the timeout + configuration.set( + JobManagerOptions.MAXIMUM_DELAY_FOR_SCALE_TRIGGER, TestingUtils.infiniteDuration()); + + // no cooldown to avoid delaying the test even more + configuration.set(JobManagerOptions.SCHEDULER_SCALING_INTERVAL_MIN, Duration.ZERO); + + return configuration; + } + + @Test + void testRescaleOnCheckpoint( + @InjectMiniCluster MiniCluster miniCluster, + @InjectClusterClient 
RestClusterClient restClusterClient) + throws Exception { + final Configuration config = new Configuration(); + + final StreamExecutionEnvironment env = + StreamExecutionEnvironment.getExecutionEnvironment(config); + env.setParallelism(BEFORE_RESCALE_PARALLELISM); + env.fromSequence(0, Integer.MAX_VALUE).sinkTo(new DiscardingSink<>()); + + final JobGraph jobGraph = env.getStreamGraph().getJobGraph(); + final Iterator jobVertexIterator = jobGraph.getVertices().iterator(); + assertThat(jobVertexIterator.hasNext()) + .as("There needs to be at least one JobVertex.") + .isTrue(); + final JobResourceRequirements jobResourceRequirements = + JobResourceRequirements.newBuilder() + .setParallelismForJobVertex( + jobVertexIterator.next().getID(), 1, AFTER_RESCALE_PARALLELISM) + .build(); + assertThat(jobVertexIterator.hasNext()) + .as("This test expects to have only one JobVertex.") + .isFalse(); + + restClusterClient.submitJob(jobGraph).join(); + try { + final JobID jobId = jobGraph.getJobID(); + + waitForRunningTasks(restClusterClient, jobId, BEFORE_RESCALE_PARALLELISM); + + restClusterClient.updateJobResourceRequirements(jobId, jobResourceRequirements).join(); + + // timeout to allow any unexpected rescaling to happen anyway + Thread.sleep(REQUIREMENT_UPDATE_TO_CHECKPOINT_GAP.toMillis()); + + // verify that the previous timeout didn't result in a change of parallelism + waitForRunningTasks(restClusterClient, jobId, BEFORE_RESCALE_PARALLELISM); + + miniCluster.triggerCheckpoint(jobId); + + waitForRunningTasks(restClusterClient, jobId, AFTER_RESCALE_PARALLELISM); + + waitForAvailableSlots(restClusterClient, NUMBER_OF_SLOTS - AFTER_RESCALE_PARALLELISM); + } finally { + restClusterClient.cancel(jobGraph.getJobID()).join(); + } + } +}