
Commit 108ad6e

KAFKA-19767 - Send Share-Fetch one-node at a time for record_limit mode (#20855)
*What*

- After KIP-1206 introduced `record_limit` mode, we ideally want no more records in flight than the `maxRecords` field in `ShareFetchRequest` asks for.
- Currently, the client broadcasts share fetch requests to all nodes that host the leaders of the partitions it is subscribed to.
- The application thread is woken up as soon as the first response arrives, but the responses from the other nodes can each bring in up to that many more records, which then sit in the buffer. That wastes the acquisition locks held for those waiting records.
- Instead, we only want to send the next request when we poll again.
- This PR sends the fetch request to only one node at a time in `record_limit` mode.
- Partitions are rotated on each poll so that no partition is starved.

There were NCSS checkstyle errors in `ShareConsumeRequestManagerTest`, so a few refactors were added there to reduce method length.

Performance

- When there are more consumers than partitions (i.e. when data in a partition is genuinely shared), performance is almost the same as the current approach.
- When there are fewer consumers than partitions, there is a performance regression because the client waits for a node to return a response before it can send the next request.
- For that reason this behaviour is introduced only for `record_limit` mode for now; future work will improve this area.

Reviewers: Andrew Schofield <[email protected]>
1 parent 0491e11 commit 108ad6e
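As a rough illustration of the approach described above (not code from this commit; `RecordLimitFetchPlanner`, `NodeFetchPlan` and `planFetch` are hypothetical names), the per-poll decision in record_limit mode boils down to asking a single node for records and sending `maxRecords = 0` to everyone else so acknowledgements and share-session updates can still piggy-back:

```java
import java.util.ArrayList;
import java.util.List;

// Hypothetical sketch (not Kafka code): in record_limit mode, only one node per poll
// gets a ShareFetch that actually asks for records; the others are sent maxRecords = 0.
public class RecordLimitFetchPlanner {

    record NodeFetchPlan(int nodeId, int maxRecords) { }

    static List<NodeFetchPlan> planFetch(List<Integer> nodeIds, int maxRecords) {
        List<NodeFetchPlan> plans = new ArrayList<>();
        int chosenNode = -1;                 // -1 means "no node chosen yet"
        for (int nodeId : nodeIds) {
            if (chosenNode == -1) {
                chosenNode = nodeId;         // first node seen gets the real fetch
                plans.add(new NodeFetchPlan(nodeId, maxRecords));
            } else {
                plans.add(new NodeFetchPlan(nodeId, 0));
            }
        }
        return plans;
    }

    public static void main(String[] args) {
        // Nodes 1, 2 and 3 host leaders of subscribed partitions; only node 1 fetches records.
        System.out.println(planFetch(List.of(1, 2, 3), 500));
        // [NodeFetchPlan[nodeId=1, maxRecords=500], NodeFetchPlan[nodeId=2, maxRecords=0],
        //  NodeFetchPlan[nodeId=3, maxRecords=0]]
    }
}
```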

3 files changed (+185 / -131 lines)


clients/src/main/java/org/apache/kafka/clients/consumer/internals/ShareConsumeRequestManager.java

Lines changed: 44 additions & 1 deletion
@@ -18,6 +18,7 @@
 import org.apache.kafka.clients.ClientResponse;
 import org.apache.kafka.clients.Metadata;
+import org.apache.kafka.clients.consumer.ShareAcquireMode;
 import org.apache.kafka.clients.consumer.internals.NetworkClientDelegate.PollResult;
 import org.apache.kafka.clients.consumer.internals.NetworkClientDelegate.UnsentRequest;
 import org.apache.kafka.clients.consumer.internals.events.ShareAcknowledgementEvent;
@@ -89,6 +90,7 @@ public class ShareConsumeRequestManager implements RequestManager, MemberStateLi
     private final IdempotentCloser idempotentCloser = new IdempotentCloser();
     private Uuid memberId;
     private boolean fetchMoreRecords = false;
+    private final AtomicInteger fetchRecordsNodeId = new AtomicInteger(-1);
     private final Map<Integer, Map<TopicIdPartition, Acknowledgements>> fetchAcknowledgementsToSend;
     private final Map<Integer, Map<TopicIdPartition, Acknowledgements>> fetchAcknowledgementsInFlight;
     private final Map<Integer, Tuple<AcknowledgeRequestState>> acknowledgeRequestStates;
@@ -196,6 +198,13 @@ public PollResult poll(long currentTimeMs) {
                 }
                 topicNamesMap.putIfAbsent(new IdAndPartition(tip.topicId(), tip.partition()), tip.topic());
 
+                // If we have not chosen a node for fetching records yet,
+                // choose now, and rotate the assigned partitions so the next poll starts on a different partition.
+                // This is only applicable for record_limit mode.
+                if (isShareAcquireModeRecordLimit() && fetchRecordsNodeId.compareAndSet(-1, node.id())) {
+                    subscriptions.movePartitionToEnd(partition);
+                }
+
                 log.debug("Added fetch request for partition {} to node {}", tip, node.id());
             }
         }
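The `compareAndSet(-1, node.id())` guard above is what guarantees that only one node per fetch cycle is chosen, even though the loop visits every assigned partition. A standalone sketch of that behaviour using plain `java.util.concurrent` (the node ids are made up):

```java
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;

// Demonstrates why compareAndSet(-1, nodeId) selects exactly one node:
// only the first call that still sees -1 wins; later calls see the chosen id and fail.
public class ChooseOneNode {
    public static void main(String[] args) {
        AtomicInteger fetchRecordsNodeId = new AtomicInteger(-1);
        for (int nodeId : List.of(3, 1, 2)) {
            boolean chosen = fetchRecordsNodeId.compareAndSet(-1, nodeId);
            System.out.println("node " + nodeId + " chosen=" + chosen);
        }
        // node 3 chosen=true, node 1 chosen=false, node 2 chosen=false

        // When the response from the chosen node arrives, the slot is released with
        // compareAndSet(chosenId, -1), so a stale node id never clears someone else's claim.
        fetchRecordsNodeId.compareAndSet(3, -1);
        System.out.println("released, value=" + fetchRecordsNodeId.get()); // -1
    }
}
```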
@@ -245,6 +254,21 @@ public PollResult poll(long currentTimeMs) {
             log.trace("Building ShareFetch request to send to node {}", target.id());
             ShareFetchRequest.Builder requestBuilder = handler.newShareFetchBuilder(groupId, shareFetchConfig);
 
+            // For record_limit mode, we only send a full ShareFetch to a single node at a time.
+            // We prepare to build ShareFetch requests for all nodes with session handlers to permit
+            // piggy-backing of acknowledgements, and also to adjust the topic-partitions
+            // in the share session.
+            if (isShareAcquireModeRecordLimit() && target.id() != fetchRecordsNodeId.get()) {
+                ShareFetchRequestData data = requestBuilder.data();
+                // If there's nothing to send, just skip building the request.
+                if (data.topics().isEmpty() && data.forgottenTopicsData().isEmpty()) {
+                    return null;
+                } else {
+                    // There is something to send, but we don't want to fetch any records.
+                    requestBuilder.data().setMaxRecords(0);
+                }
+            }
+
             nodesWithPendingRequests.add(target.id());
 
             BiConsumer<ClientResponse, Throwable> responseHandler = (clientResponse, error) -> {
@@ -255,11 +279,15 @@ public PollResult poll(long currentTimeMs) {
                 }
             };
             return new UnsentRequest(requestBuilder, Optional.of(target)).whenComplete(responseHandler);
-        }).collect(Collectors.toList());
+        }).filter(Objects::nonNull).collect(Collectors.toList());
 
         return new PollResult(requests);
     }
 
+    private boolean isShareAcquireModeRecordLimit() {
+        return shareFetchConfig.shareAcquireMode == ShareAcquireMode.RECORD_LIMIT;
+    }
+
     /**
      * Add acknowledgements for a topic-partition to the node's in-flight acknowledgements.
      *
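Since the request-building lambda can now return `null` for nodes that have nothing at all to send, the stream gains `filter(Objects::nonNull)` so skipped requests never reach the `PollResult`. The same map-then-filter shape in isolation (node ids and strings are illustrative only):

```java
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;

// Map each node id to a request description, returning null for nodes with nothing to
// send, then drop the nulls before collecting - the same shape as the stream above.
public class SkipEmptyRequests {
    public static void main(String[] args) {
        List<String> requests = List.of(1, 2, 3).stream()
                .map(nodeId -> nodeId == 2 ? null : "ShareFetch->node" + nodeId)
                .filter(Objects::nonNull)
                .collect(Collectors.toList());
        System.out.println(requests); // [ShareFetch->node1, ShareFetch->node3]
    }
}
```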
@@ -738,6 +766,15 @@ private boolean isLeaderKnownToHaveChanged(int nodeId, TopicIdPartition topicIdP
         return false;
     }
 
+    @Override
+    public long maximumTimeToWait(long currentTimeMs) {
+        // When fetching records and there is no chosen node for fetching, we do not want to wait for the next poll in record_limit mode.
+        if (isShareAcquireModeRecordLimit() && fetchMoreRecords && subscriptions.numAssignedPartitions() > 0 && fetchRecordsNodeId.get() == -1) {
+            return 0L;
+        }
+        return Long.MAX_VALUE;
+    }
+
     private void handleShareFetchSuccess(Node fetchTarget,
                                          ShareFetchRequestData requestData,
                                          ClientResponse resp) {
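Returning 0 from `maximumTimeToWait` signals that the consumer's background thread should not block when records are wanted but no fetch node has been chosen yet, while `Long.MAX_VALUE` means this manager imposes no deadline of its own. A simplified sketch of how a caller might combine such bounds, assuming the smallest value wins (an assumption for illustration, not the actual network-thread code):

```java
import java.util.List;

// Sketch of combining per-manager wait bounds: any manager that returns 0 forces an
// immediate re-poll, otherwise the smallest bound wins. The lambdas stand in for
// request managers; this is not the Kafka internal API.
public class CombineWaitTimes {
    interface WaitBound {
        long maximumTimeToWait(long currentTimeMs);
    }

    public static void main(String[] args) {
        WaitBound shareFetchManager = now -> 0L;      // record_limit mode, no node chosen yet
        WaitBound heartbeatManager = now -> 3_000L;   // e.g. next heartbeat due in 3s

        long pollTimeout = List.of(shareFetchManager, heartbeatManager).stream()
                .mapToLong(m -> m.maximumTimeToWait(System.currentTimeMillis()))
                .min()
                .orElse(Long.MAX_VALUE);
        System.out.println("poll timeout = " + pollTimeout + " ms"); // 0 -> poll again immediately
    }
}
```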
@@ -858,6 +895,9 @@ private void handleShareFetchSuccess(Node fetchTarget,
             metricsManager.recordLatency(resp.destination(), resp.requestLatencyMs());
         } finally {
             log.debug("Removing pending request for node {} - success", fetchTarget.id());
+            if (isShareAcquireModeRecordLimit()) {
+                fetchRecordsNodeId.compareAndSet(fetchTarget.id(), -1);
+            }
             nodesWithPendingRequests.remove(fetchTarget.id());
         }
     }
@@ -896,6 +936,9 @@ private void handleShareFetchFailure(Node fetchTarget,
             }));
         } finally {
             log.debug("Removing pending request for node {} - failed", fetchTarget.id());
+            if (isShareAcquireModeRecordLimit()) {
+                fetchRecordsNodeId.compareAndSet(fetchTarget.id(), -1);
+            }
             nodesWithPendingRequests.remove(fetchTarget.id());
         }
     }

clients/src/main/java/org/apache/kafka/clients/consumer/internals/SubscriptionState.java

Lines changed: 2 additions & 1 deletion
@@ -39,6 +39,7 @@
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
+import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Objects;
@@ -315,7 +316,7 @@ public synchronized void assignFromSubscribed(Collection<TopicPartition> assignm
         if (!this.hasAutoAssignedPartitions())
             throw new IllegalArgumentException("Attempt to dynamically assign partitions while manual assignment in use");
 
-        Map<TopicPartition, TopicPartitionState> assignedPartitionStates = new HashMap<>(assignments.size());
+        Map<TopicPartition, TopicPartitionState> assignedPartitionStates = new LinkedHashMap<>(assignments.size());
         for (TopicPartition tp : assignments) {
             TopicPartitionState state = this.assignment.stateValue(tp);
             if (state == null)
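The `HashMap` to `LinkedHashMap` switch matters because `subscriptions.movePartitionToEnd(...)` relies on the assignment map's iteration order to rotate partitions between polls; `LinkedHashMap` preserves insertion order, while `HashMap` guarantees no particular order. A quick standalone comparison (partition names are illustrative):

```java
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

// Shows that LinkedHashMap keeps the order in which partitions were inserted,
// while HashMap makes no ordering guarantee - which is why the assignment map
// needs to be a LinkedHashMap for per-poll partition rotation to survive reassignment.
public class AssignmentOrdering {
    public static void main(String[] args) {
        List<String> assignment = List.of("topicA-2", "topicA-0", "topicA-1");

        Map<String, String> hashed = new HashMap<>();
        Map<String, String> linked = new LinkedHashMap<>();
        for (String tp : assignment) {
            hashed.put(tp, "state");
            linked.put(tp, "state");
        }

        System.out.println("HashMap order:       " + hashed.keySet()); // no guaranteed order
        System.out.println("LinkedHashMap order: " + linked.keySet()); // [topicA-2, topicA-0, topicA-1]
    }
}
```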
