[FLINK-35553][runtime] Wire-up RescaleManager with CheckpointLifecycleListener in Executing state
dmvk authored and XComp committed Jul 2, 2024
1 parent 4eab93e commit 38738d9
Showing 16 changed files with 864 additions and 65 deletions.
@@ -12,7 +12,7 @@
<td><h5>jobmanager.adaptive-scheduler.max-delay-for-scale-trigger</h5></td>
<td style="word-wrap: break-word;">(none)</td>
<td>Duration</td>
<td>The maximum time the JobManager will wait with evaluating previously observed events for rescaling (default: 0ms if checkpointing is disabled and %dx of the checkpointing interval if checkpointing is enabled).</td>
<td>The maximum time the JobManager will wait with evaluating previously observed events for rescaling (default: 0ms if checkpointing is disabled and the checkpointing interval multiplied by the by-1-incremented parameter value of jobmanager.adaptive-scheduler.scale-on-failed-checkpoints-count if checkpointing is enabled).</td>
</tr>
<tr>
<td><h5>jobmanager.adaptive-scheduler.min-parallelism-increase</h5></td>
@@ -32,6 +32,12 @@
<td>Duration</td>
<td>The maximum time the JobManager will wait to acquire all required resources after a job submission or restart. Once elapsed it will try to run the job with a lower parallelism, or fail if the minimum amount of resources could not be acquired.<br />Increasing this value will make the cluster more resilient against temporary resources shortages (e.g., there is more time for a failed TaskManager to be restarted).<br />Setting a negative duration will disable the resource timeout: The JobManager will wait indefinitely for resources to appear.<br />If <code class="highlighter-rouge">scheduler-mode</code> is configured to <code class="highlighter-rouge">REACTIVE</code>, this configuration value will default to a negative value to disable the resource timeout.</td>
</tr>
<tr>
<td><h5>jobmanager.adaptive-scheduler.scale-on-failed-checkpoints-count</h5></td>
<td style="word-wrap: break-word;">2</td>
<td>Integer</td>
<td>The number of consecutive failed checkpoints that will trigger rescaling even in the absence of a completed checkpoint.</td>
</tr>
<tr>
<td><h5>jobmanager.adaptive-scheduler.scaling-interval.max</h5></td>
<td style="word-wrap: break-word;">(none)</td>
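For illustration only (numbers not taken from the diff): with a checkpointing interval of 30 s and the default jobmanager.adaptive-scheduler.scale-on-failed-checkpoints-count of 2, the default maximum delay for the scale trigger described above works out to (2 + 1) × 30 s = 90 s.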
@@ -90,7 +90,7 @@
<td><h5>jobmanager.adaptive-scheduler.max-delay-for-scale-trigger</h5></td>
<td style="word-wrap: break-word;">(none)</td>
<td>Duration</td>
<td>The maximum time the JobManager will wait with evaluating previously observed events for rescaling (default: 0ms if checkpointing is disabled and %dx of the checkpointing interval if checkpointing is enabled).</td>
<td>The maximum time the JobManager will wait with evaluating previously observed events for rescaling (default: 0ms if checkpointing is disabled and the checkpointing interval multiplied by the by-1-incremented parameter value of jobmanager.adaptive-scheduler.scale-on-failed-checkpoints-count if checkpointing is enabled).</td>
</tr>
<tr>
<td><h5>jobmanager.adaptive-scheduler.min-parallelism-increase</h5></td>
@@ -110,6 +110,12 @@
<td>Duration</td>
<td>The maximum time the JobManager will wait to acquire all required resources after a job submission or restart. Once elapsed it will try to run the job with a lower parallelism, or fail if the minimum amount of resources could not be acquired.<br />Increasing this value will make the cluster more resilient against temporary resources shortages (e.g., there is more time for a failed TaskManager to be restarted).<br />Setting a negative duration will disable the resource timeout: The JobManager will wait indefinitely for resources to appear.<br />If <code class="highlighter-rouge">scheduler-mode</code> is configured to <code class="highlighter-rouge">REACTIVE</code>, this configuration value will default to a negative value to disable the resource timeout.</td>
</tr>
<tr>
<td><h5>jobmanager.adaptive-scheduler.scale-on-failed-checkpoints-count</h5></td>
<td style="word-wrap: break-word;">2</td>
<td>Integer</td>
<td>The number of consecutive failed checkpoints that will trigger rescaling even in the absence of a completed checkpoint.</td>
</tr>
<tr>
<td><h5>jobmanager.adaptive-scheduler.scaling-interval.max</h5></td>
<td style="word-wrap: break-word;">(none)</td>
@@ -12,7 +12,7 @@
<td><h5>jobmanager.adaptive-scheduler.max-delay-for-scale-trigger</h5></td>
<td style="word-wrap: break-word;">(none)</td>
<td>Duration</td>
<td>The maximum time the JobManager will wait with evaluating previously observed events for rescaling (default: 0ms if checkpointing is disabled and %dx of the checkpointing interval if checkpointing is enabled).</td>
<td>The maximum time the JobManager will wait with evaluating previously observed events for rescaling (default: 0ms if checkpointing is disabled and the checkpointing interval multiplied by the by-1-incremented parameter value of jobmanager.adaptive-scheduler.scale-on-failed-checkpoints-count if checkpointing is enabled).</td>
</tr>
<tr>
<td><h5>jobmanager.adaptive-scheduler.min-parallelism-increase</h5></td>
@@ -32,6 +32,12 @@
<td>Duration</td>
<td>The maximum time the JobManager will wait to acquire all required resources after a job submission or restart. Once elapsed it will try to run the job with a lower parallelism, or fail if the minimum amount of resources could not be acquired.<br />Increasing this value will make the cluster more resilient against temporary resources shortages (e.g., there is more time for a failed TaskManager to be restarted).<br />Setting a negative duration will disable the resource timeout: The JobManager will wait indefinitely for resources to appear.<br />If <code class="highlighter-rouge">scheduler-mode</code> is configured to <code class="highlighter-rouge">REACTIVE</code>, this configuration value will default to a negative value to disable the resource timeout.</td>
</tr>
<tr>
<td><h5>jobmanager.adaptive-scheduler.scale-on-failed-checkpoints-count</h5></td>
<td style="word-wrap: break-word;">2</td>
<td>Integer</td>
<td>The number of consecutive failed checkpoints that will trigger rescaling even in the absence of a completed checkpoint.</td>
</tr>
<tr>
<td><h5>jobmanager.adaptive-scheduler.scaling-interval.max</h5></td>
<td style="word-wrap: break-word;">(none)</td>
@@ -39,7 +39,6 @@
public class JobManagerOptions {

public static final MemorySize MIN_JVM_HEAP_SIZE = MemorySize.ofMebiBytes(128);
public static final int FACTOR_FOR_DEFAULT_MAXIMUM_DELAY_FOR_RESCALE_TRIGGER = 3;

/**
* The config parameter defining the network address to connect to for communication with the
@@ -574,6 +573,20 @@ public InlineElement getDescription() {
code(SchedulerExecutionMode.REACTIVE.name()))
.build());

@Documentation.Section({
Documentation.Sections.EXPERT_SCHEDULING,
Documentation.Sections.ALL_JOB_MANAGER
})
public static final ConfigOption<Integer> SCHEDULER_SCALE_ON_FAILED_CHECKPOINTS_COUNT =
key("jobmanager.adaptive-scheduler.scale-on-failed-checkpoints-count")
.intType()
.defaultValue(2)
.withDescription(
Description.builder()
.text(
"The number of consecutive failed checkpoints that will trigger rescaling even in the absence of a completed checkpoint.")
.build());

@Documentation.Section({
Documentation.Sections.EXPERT_SCHEDULING,
Documentation.Sections.ALL_JOB_MANAGER
@@ -586,10 +599,8 @@ public InlineElement getDescription() {
Description.builder()
.text(
"The maximum time the JobManager will wait with evaluating previously observed events for rescaling (default: 0ms if checkpointing is disabled "
+ "and %dx of the checkpointing interval if checkpointing is enabled).",
text(
String.valueOf(
FACTOR_FOR_DEFAULT_MAXIMUM_DELAY_FOR_RESCALE_TRIGGER)))
+ "and the checkpointing interval multiplied by the by-1-incremented parameter value of %s if checkpointing is enabled).",
text(SCHEDULER_SCALE_ON_FAILED_CHECKPOINTS_COUNT.key()))
.build());

@Documentation.Section({
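For context only (not part of the diff): the new option can be set programmatically through Flink's Configuration API using the constant introduced above. The snippet below is a minimal sketch; the value 3 is an arbitrary example.

    import org.apache.flink.configuration.Configuration;
    import org.apache.flink.configuration.JobManagerOptions;

    public class ScaleOnFailedCheckpointsExample {
        public static void main(String[] args) {
            Configuration conf = new Configuration();
            // Trigger rescaling after 3 consecutive failed checkpoints instead of the default 2.
            conf.set(JobManagerOptions.SCHEDULER_SCALE_ON_FAILED_CHECKPOINTS_COUNT, 3);
            // The same option can also be set via its string key in the cluster configuration:
            // jobmanager.adaptive-scheduler.scale-on-failed-checkpoints-count: 3
            System.out.println(conf);
        }
    }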
@@ -0,0 +1,33 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.flink.runtime.checkpoint;

/** An interface that allows listening on the checkpoint lifecycle. */
public interface CheckpointStatsListener {

/** Called when a checkpoint was completed successfully. */
default void onCompletedCheckpoint() {
// No-op.
}

/** Called when a checkpoint failed. */
default void onFailedCheckpoint() {
// No-op.
}
}
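Purely as an illustration of the new interface (this class is hypothetical and not part of the commit), a listener could track consecutive checkpoint failures like this:

    import org.apache.flink.runtime.checkpoint.CheckpointStatsListener;

    /** Hypothetical listener that counts consecutive failed checkpoints (illustration only). */
    public class ConsecutiveCheckpointFailureListener implements CheckpointStatsListener {

        private int consecutiveFailures;

        @Override
        public void onCompletedCheckpoint() {
            // A successful checkpoint resets the failure streak.
            consecutiveFailures = 0;
        }

        @Override
        public void onFailedCheckpoint() {
            consecutiveFailures++;
        }

        public int getConsecutiveFailures() {
            return consecutiveFailures;
        }
    }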
@@ -73,6 +73,7 @@ public class DefaultCheckpointStatsTracker implements CheckpointStatsTracker {

private Optional<JobInitializationMetricsBuilder> jobInitializationMetricsBuilder =
Optional.empty();
@Nullable private final CheckpointStatsListener checkpointStatsListener;

/** Latest created snapshot. */
private volatile CheckpointStatsSnapshot latestSnapshot;
@@ -95,9 +96,25 @@ public class DefaultCheckpointStatsTracker implements CheckpointStatsTracker {
*/
public DefaultCheckpointStatsTracker(
int numRememberedCheckpoints, JobManagerJobMetricGroup metricGroup) {
this(numRememberedCheckpoints, metricGroup, null);
}

/**
* Creates a new checkpoint stats tracker.
*
* @param numRememberedCheckpoints Maximum number of checkpoints to remember, including in
* progress ones.
* @param metricGroup Metric group for exposed metrics.
* @param checkpointStatsListener Listener for monitoring checkpoint-related events.
*/
public DefaultCheckpointStatsTracker(
int numRememberedCheckpoints,
JobManagerJobMetricGroup metricGroup,
@Nullable CheckpointStatsListener checkpointStatsListener) {
checkArgument(numRememberedCheckpoints >= 0, "Negative number of remembered checkpoints");
this.history = new CheckpointStatsHistory(numRememberedCheckpoints);
this.metricGroup = metricGroup;
this.checkpointStatsListener = checkpointStatsListener;

// Latest snapshot is empty
latestSnapshot =
@@ -203,6 +220,10 @@ public void reportCompletedCheckpoint(CompletedCheckpointStats completed) {

dirty = true;
logCheckpointStatistics(completed);

if (checkpointStatsListener != null) {
checkpointStatsListener.onCompletedCheckpoint();
}
} finally {
statsReadWriteLock.unlock();
}
@@ -217,6 +238,10 @@ public void reportFailedCheckpoint(FailedCheckpointStats failed) {

dirty = true;
logCheckpointStatistics(failed);

if (checkpointStatsListener != null) {
checkpointStatsListener.onFailedCheckpoint();
}
} finally {
statsReadWriteLock.unlock();
}
@@ -256,6 +281,10 @@ public void reportFailedCheckpointsWithoutInProgress() {
counts.incrementFailedCheckpointsWithoutInProgress();

dirty = true;

if (checkpointStatsListener != null) {
checkpointStatsListener.onFailedCheckpoint();
}
} finally {
statsReadWriteLock.unlock();
}
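A rough sketch of how the extended constructor could be used (assumed names; the actual wiring into the Executing state happens elsewhere in this commit and is not shown here):

    import org.apache.flink.runtime.checkpoint.CheckpointStatsListener;
    import org.apache.flink.runtime.checkpoint.CheckpointStatsTracker;
    import org.apache.flink.runtime.checkpoint.DefaultCheckpointStatsTracker;
    import org.apache.flink.runtime.metrics.groups.JobManagerJobMetricGroup;

    public class TrackerWiringSketch {
        // metricGroup would come from the JobManager's metric registry in practice.
        static CheckpointStatsTracker createTracker(
                JobManagerJobMetricGroup metricGroup, CheckpointStatsListener listener) {
            // Remember up to 10 checkpoints and forward completed/failed events to the listener.
            return new DefaultCheckpointStatsTracker(10, metricGroup, listener);
        }
    }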
