
KAFKA-6145: Pt 1. Bump protocol version and encode task lag map #8121

Merged: 24 commits, Mar 6, 2020
StreamsPartitionAssignor.java
@@ -217,49 +217,17 @@ public List<RebalanceProtocol> supportedProtocols() {
public ByteBuffer subscriptionUserData(final Set<String> topics) {
// Adds the following information to subscription
// 1. Client UUID (a unique id assigned to an instance of KafkaStreams)
// 2. Task ids of previously running tasks
// 3. Task ids of valid local states on the client's state directory.
final Set<TaskId> standbyTasks = taskManager.tasksOnLocalStorage();
final Set<TaskId> activeTasks = prepareForSubscription(taskManager,
topics,
standbyTasks,
rebalanceProtocol);
// 2. Map from task id to its overall lag
Member Author:
This plus the tech debt cleanup allows for the subscription handling to be greatly simplified, here and below in #assign


taskManager.handleRebalanceStart(topics);

return new SubscriptionInfo(
usedSubscriptionMetadataVersion,
LATEST_SUPPORTED_VERSION,
taskManager.processId(),
activeTasks,
standbyTasks,
userEndPoint)
.encode();
}

protected static Set<TaskId> prepareForSubscription(final TaskManager taskManager,
final Set<String> topics,
final Set<TaskId> standbyTasks,
final RebalanceProtocol rebalanceProtocol) {
// Any tasks that are not yet running are counted as standby tasks for assignment purposes,
// along with any old tasks for which we still found state on disk
final Set<TaskId> activeTasks;

switch (rebalanceProtocol) {
case EAGER:
// In eager, onPartitionsRevoked is called first and we must get the previously saved running task ids
activeTasks = taskManager.activeTaskIds();
standbyTasks.removeAll(activeTasks);
break;
case COOPERATIVE:
// In cooperative, we will use the encoded ownedPartitions to determine the running tasks
activeTasks = Collections.emptySet();
standbyTasks.removeAll(taskManager.activeTaskIds());
break;
default:
throw new IllegalStateException("Streams partition assignor's rebalance protocol is unknown");
}

taskManager.handleRebalanceStart(topics);

return activeTasks;
userEndPoint,
taskManager.getTaskOffsetSums())
.encode();
}
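
For illustration, here is a minimal sketch (not part of this diff) of what a version-7 subscription now carries: the previously encoded active/standby task sets are replaced by a single task-to-offset-sum map, passed to the new SubscriptionInfo constructor shown further down. The process id and endpoint values are placeholders, and the snippet assumes the same types used in this file (TaskId, Task, SubscriptionInfo, ByteBuffer, UUID, Map, HashMap).

// Sketch only: build and encode a version-7 subscription carrying a task lag map.
final Map<TaskId, Long> exampleOffsetSums = new HashMap<>();
exampleOffsetSums.put(new TaskId(0, 0), Task.LATEST_OFFSET); // running active task: negative "caught up" sentinel
exampleOffsetSums.put(new TaskId(0, 1), 1234L);              // standby or dormant task: sum of its store offsets
final ByteBuffer encodedSubscription = new SubscriptionInfo(
    7,                  // usedVersion
    7,                  // latestSupportedVersion
    UUID.randomUUID(),  // placeholder for the client's process id
    null,               // userEndPoint is optional
    exampleOffsetSums
).encode();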

private Map<String, Assignment> errorAssignment(final Map<UUID, ClientMetadata> clientsMetadata,
@@ -314,7 +282,6 @@ public GroupAssignment assign(final Cluster metadata, final GroupSubscription gr
// keep track of any future consumers in a "dummy" Client since we can't decipher their subscription
final UUID futureId = randomUUID();
final ClientMetadata futureClient = new ClientMetadata(null);
clientMetadataMap.put(futureId, futureClient);
ableegoldman marked this conversation as resolved.

int minReceivedMetadataVersion = LATEST_SUPPORTED_VERSION;
int minSupportedMetadataVersion = LATEST_SUPPORTED_VERSION;
@@ -333,6 +300,9 @@ public GroupAssignment assign(final Cluster metadata, final GroupSubscription gr
if (usedVersion > LATEST_SUPPORTED_VERSION) {
futureMetadataVersion = usedVersion;
processId = futureId;
if (!clientMetadataMap.containsKey(futureId)) {
clientMetadataMap.put(futureId, futureClient);
}
} else {
processId = info.processId();
}
@@ -345,7 +315,7 @@ public GroupAssignment assign(final Cluster metadata, final GroupSubscription gr
clientMetadataMap.put(info.processId(), clientMetadata);
}

// add the consumer and any info its its subscription to the client
// add the consumer and any info in its subscription to the client
clientMetadata.addConsumer(consumerId, subscription.ownedPartitions());
allOwnedPartitions.addAll(subscription.ownedPartitions());
clientMetadata.addPreviousTasks(info);
@@ -354,7 +324,6 @@ public GroupAssignment assign(final Cluster metadata, final GroupSubscription gr
final boolean versionProbing;
if (futureMetadataVersion == UNKNOWN) {
versionProbing = false;
clientMetadataMap.remove(futureId);
} else if (minReceivedMetadataVersion >= EARLIEST_PROBEABLE_VERSION) {
versionProbing = true;
log.info("Received a future (version probing) subscription (version: {})."
@@ -589,12 +558,14 @@ public GroupAssignment assign(final Cluster metadata, final GroupSubscription gr

final Map<UUID, ClientState> states = new HashMap<>();
for (final Map.Entry<UUID, ClientMetadata> entry : clientMetadataMap.entrySet()) {
final UUID uuid = entry.getKey();
final ClientState state = entry.getValue().state;
states.put(entry.getKey(), state);
states.put(uuid, state);

// Either the active tasks (eager) OR the owned partitions (cooperative) were encoded in the subscription
// according to the rebalancing protocol, so convert any partitions in a client to tasks where necessary
if (!state.ownedPartitions().isEmpty()) {
// there are two cases where we need to construct the prevTasks from the ownedPartitions:
// 1) COOPERATIVE clients on version 2.4-2.5 do not encode active tasks and rely on ownedPartitions instead
// 2) future client during version probing: we can't decode the future subscription info's prev tasks
if (!state.ownedPartitions().isEmpty() && (uuid == futureId || state.prevActiveTasks().isEmpty())) {
final Set<TaskId> previousActiveTasks = new HashSet<>();
for (final Map.Entry<TopicPartition, String> partitionEntry : state.ownedPartitions().entrySet()) {
final TopicPartition tp = partitionEntry.getKey();
@@ -1154,6 +1125,7 @@ public void onAssignment(final Assignment assignment, final ConsumerGroupMetadat
topicToPartitionInfo = getTopicPartitionInfo(partitionsByHost);
break;
case 6:
case 7:
validateActiveTaskEncoding(partitions, info, logPrefix);

activeTasks = getActiveTasks(partitions, info);
@@ -1293,7 +1265,8 @@ protected String userEndPoint() {
return userEndPoint;
}

protected TaskManager taskManger() {
protected TaskManager taskManager() {
return taskManager;
}

}
TaskManager.java
@@ -51,6 +51,7 @@

import static org.apache.kafka.streams.processor.internals.Task.State.CREATED;
import static org.apache.kafka.streams.processor.internals.Task.State.RESTORING;
import static org.apache.kafka.streams.processor.internals.Task.State.RUNNING;

public class TaskManager {
// initialize the task list
@@ -354,11 +355,27 @@ void handleLostAll() {
}
}

/**
* @return Map from task id to its total offset summed across all state stores
*/
public Map<TaskId, Long> getTaskOffsetSums() {
final Map<TaskId, Long> taskOffsetSums = new HashMap<>();

for (final TaskId id : tasksOnLocalStorage()) {
if (isRunning(id)) {
taskOffsetSums.put(id, Task.LATEST_OFFSET);
Contributor:
I'm just a tiny bit uncomfortable with re-using that sentinel, because the correctness of our logic depends on the active sentinel being less than the standby sentinel, so it must be less than zero. Do we have a reason to believe that Task.LATEST_OFFSET would never change to a number that would spoil us here, such as zero?

Member Author:
I actually changed this based on working on the next PR, as Task#changelogOffsets uses this sentinel for exactly the same thing, i.e. as an indicator that the task is running (and active). It is only used in computing the lag info for KIP-535, which has a similar need to differentiate a running task that is completely caught up from any other. So I can't imagine this being changed -- but I can add a comment to the constant explaining that it should always be negative (not sure why it's "-2" specifically, as opposed to "-1", do you?)

} else {
taskOffsetSums.put(id, 0L);
}
}
return taskOffsetSums;
}
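
To make the sentinel discussion above concrete, here is a hedged sketch of how the receiving side could turn an encoded offset sum into a task lag. This is an assumption about the follow-up work (this PR only encodes the sums), and endOffsetSum is a hypothetical input representing the task's total changelog end offset.

// Hypothetical receiver-side lag computation (assumption, not code in this PR).
// Task.LATEST_OFFSET is negative on purpose: it can never collide with a real
// (non-negative) offset sum, so "running and caught up" stays unambiguous.
static long taskLag(final long offsetSum, final long endOffsetSum) {
    if (offsetSum == Task.LATEST_OFFSET) {
        return 0L;                                    // caught-up active task has no lag
    }
    return Math.max(endOffsetSum - offsetSum, 0L);    // remaining restoration work otherwise
}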

/**
* Returns ids of tasks whose states are kept on the local storage. This includes active, standby, and previously
* assigned but not yet cleaned up tasks
*/
public Set<TaskId> tasksOnLocalStorage() {
private Set<TaskId> tasksOnLocalStorage() {
// A client could contain some inactive tasks whose states are still kept on the local storage in the following scenarios:
// 1) the client is actively maintaining standby tasks by maintaining their states from the change log.
// 2) the client has just got some tasks migrated out of itself to other clients while these task states
@@ -472,6 +489,11 @@ private Stream<Task> standbyTaskStream() {
return tasks.values().stream().filter(t -> !t.isActive());
}

private boolean isRunning(final TaskId id) {
final Task task = tasks.get(id);
return task != null && task.isActive() && task.state() == RUNNING;
}

/**
* @throws TaskMigratedException if committing offsets failed (non-EOS)
* or if the task producer got fenced (EOS)
AssignmentInfo.java
@@ -163,6 +163,7 @@ public ByteBuffer encode() {
out.writeInt(errCode);
break;
case 6:
case 7:
out.writeInt(usedVersion);
out.writeInt(commonlySupportedVersion);
encodeActiveAndStandbyTaskAssignment(out);
@@ -327,6 +328,7 @@ public static AssignmentInfo decode(final ByteBuffer data) {
assignmentInfo.errCode = in.readInt();
break;
case 6:
case 7:
commonlySupportedVersion = in.readInt();
assignmentInfo = new AssignmentInfo(usedVersion, commonlySupportedVersion);
decodeActiveTasks(assignmentInfo, in);
StreamsAssignmentProtocolVersions.java
@@ -19,7 +19,7 @@
public final class StreamsAssignmentProtocolVersions {
public static final int UNKNOWN = -1;
public static final int EARLIEST_PROBEABLE_VERSION = 3;
public static final int LATEST_SUPPORTED_VERSION = 6;
public static final int LATEST_SUPPORTED_VERSION = 7;

private StreamsAssignmentProtocolVersions() {}
}
SubscriptionInfo.java
@@ -16,11 +16,15 @@
*/
package org.apache.kafka.streams.processor.internals.assignment;

import java.util.HashSet;
import java.util.Map;
import org.apache.kafka.common.protocol.ByteBufferAccessor;
import org.apache.kafka.common.protocol.ObjectSerializationCache;
import org.apache.kafka.streams.errors.TaskAssignmentException;
import org.apache.kafka.streams.internals.generated.SubscriptionInfoData;
import org.apache.kafka.streams.internals.generated.SubscriptionInfoData.TaskOffsetSum;
import org.apache.kafka.streams.processor.TaskId;
import org.apache.kafka.streams.processor.internals.Task;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@@ -41,6 +45,7 @@ public class SubscriptionInfo {
private final SubscriptionInfoData data;
private Set<TaskId> prevTasksCache = null;
private Set<TaskId> standbyTasksCache = null;
private Map<TaskId, Long> taskOffsetSumsCache = null;

static {
// Just statically check to make sure that the generated code always stays in sync with the overall protocol
@@ -69,12 +74,13 @@ private static void validateVersions(final int version, final int latestSupporte
public SubscriptionInfo(final int version,
final int latestSupportedVersion,
final UUID processId,
final Set<TaskId> prevTasks,
final Set<TaskId> standbyTasks,
final String userEndPoint) {
final String userEndPoint,
final Map<TaskId, Long> taskOffsetSums) {
validateVersions(version, latestSupportedVersion);
final SubscriptionInfoData data = new SubscriptionInfoData();
data.setVersion(version);
data.setProcessId(processId);

if (version >= 2) {
data.setUserEndPoint(userEndPoint == null
? new byte[0]
@@ -83,7 +89,38 @@ public SubscriptionInfo(final int version,
if (version >= 3) {
data.setLatestSupportedVersion(latestSupportedVersion);
}
data.setProcessId(processId);
if (version >= 7) {
setTaskOffsetSumDataFromTaskOffsetSumMap(data, taskOffsetSums);
} else {
setPrevAndStandbySetsFromParsedTaskOffsetSumMap(data, taskOffsetSums);
}
this.data = data;
}

private static void setTaskOffsetSumDataFromTaskOffsetSumMap(final SubscriptionInfoData data,
final Map<TaskId, Long> taskOffsetSums) {
data.setTaskOffsetSums(taskOffsetSums.entrySet().stream().map(t -> {
final SubscriptionInfoData.TaskOffsetSum taskOffsetSum = new SubscriptionInfoData.TaskOffsetSum();
taskOffsetSum.setTopicGroupId(t.getKey().topicGroupId);
taskOffsetSum.setPartition(t.getKey().partition);
taskOffsetSum.setOffsetSum(t.getValue());
return taskOffsetSum;
}).collect(Collectors.toList()));
}

private static void setPrevAndStandbySetsFromParsedTaskOffsetSumMap(final SubscriptionInfoData data,
Contributor:
Since we invoke this method from a number of places, should we add a flag and make sure it only sets the state once?

Contributor:
bump

Member Author:
I realized the other callers actually don't need to call this at all, so now these only get called from the constructor

final Map<TaskId, Long> taskOffsetSums) {
final Set<TaskId> prevTasks = new HashSet<>();
final Set<TaskId> standbyTasks = new HashSet<>();

for (final Map.Entry<TaskId, Long> taskOffsetSum : taskOffsetSums.entrySet()) {
if (taskOffsetSum.getValue() == Task.LATEST_OFFSET) {
prevTasks.add(taskOffsetSum.getKey());
} else {
standbyTasks.add(taskOffsetSum.getKey());
}
}

data.setPrevTasks(prevTasks.stream().map(t -> {
final SubscriptionInfoData.TaskId taskId = new SubscriptionInfoData.TaskId();
taskId.setTopicGroupId(t.topicGroupId);
@@ -96,8 +133,6 @@ public SubscriptionInfo(final int version,
taskId.setPartition(t.partition);
return taskId;
}).collect(Collectors.toList()));

this.data = data;
}

private SubscriptionInfo(final SubscriptionInfoData subscriptionInfoData) {
@@ -119,6 +154,10 @@ public UUID processId() {

public Set<TaskId> prevTasks() {
if (prevTasksCache == null) {
// lazily initialize the prev and standby task maps as they may not be needed
if (data.version() >= 7) {
setPrevAndStandbySetsFromParsedTaskOffsetSumMap(data, taskOffsetSums());
}
prevTasksCache = Collections.unmodifiableSet(
data.prevTasks()
.stream()
@@ -131,6 +170,10 @@ public Set<TaskId> prevTasks() {

public Set<TaskId> standbyTasks() {
if (standbyTasksCache == null) {
// lazily initialize the prev and standby task maps as they may not be needed
if (data.version() >= 7) {
setPrevAndStandbySetsFromParsedTaskOffsetSumMap(data, taskOffsetSums());
}
standbyTasksCache = Collections.unmodifiableSet(
data.standbyTasks()
.stream()
@@ -141,12 +184,39 @@ public Set<TaskId> standbyTasks() {
return standbyTasksCache;
}

public Map<TaskId, Long> taskOffsetSums() {
if (taskOffsetSumsCache == null) {
taskOffsetSumsCache = Collections.unmodifiableMap(
data.taskOffsetSums()
.stream()
.collect(Collectors.toMap(t -> new TaskId(t.topicGroupId(), t.partition()), TaskOffsetSum::offsetSum))
);
}
return taskOffsetSumsCache;
}

public String userEndPoint() {
return data.userEndPoint() == null || data.userEndPoint().length == 0
? null
: new String(data.userEndPoint(), StandardCharsets.UTF_8);
}

public static Set<TaskId> getActiveTasksFromTaskOffsetSumMap(final Map<TaskId, Long> taskOffsetSums) {
return taskOffsetSumMapToTaskSet(taskOffsetSums, true);
}

public static Set<TaskId> getStandbyTasksFromTaskOffsetSumMap(final Map<TaskId, Long> taskOffsetSums) {
return taskOffsetSumMapToTaskSet(taskOffsetSums, false);
}

private static Set<TaskId> taskOffsetSumMapToTaskSet(final Map<TaskId, Long> taskOffsetSums,
final boolean getActiveTasks) {
return taskOffsetSums.entrySet().stream()
.filter(t -> getActiveTasks == (t.getValue() == Task.LATEST_OFFSET))
.map(Map.Entry::getKey)
.collect(Collectors.toSet());
}
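
A short usage sketch (an assumed caller, not part of this diff): on the receiving side a decoded version-7 subscription exposes only the offset-sum map, and the old active/standby split can be recovered with the two static helpers above. SubscriptionInfo.decode is the existing decoding entry point; encodedSubscription stands for any subscription user data bytes.

// Sketch only: recover the previous active and standby task sets from a decoded subscription.
final SubscriptionInfo info = SubscriptionInfo.decode(encodedSubscription);
final Map<TaskId, Long> offsetSums = info.taskOffsetSums();
final Set<TaskId> prevActiveTasks = SubscriptionInfo.getActiveTasksFromTaskOffsetSumMap(offsetSums);
final Set<TaskId> prevStandbyTasks = SubscriptionInfo.getStandbyTasksFromTaskOffsetSumMap(offsetSums);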

/**
* @throws TaskAssignmentException if method fails to encode the data
*/