From 3ecaff277a88ecf1e9bbca9a493e3bcdfb20de6c Mon Sep 17 00:00:00 2001
From: Travis Bischel
Date: Mon, 31 Oct 2022 11:10:20 -0600
Subject: [PATCH] kgo txn: handle UNKNOWN_SERVER_ERROR more widely

This error is technically not retryable, but in the context of
transactions, at worst, we will eventually see a better, more direct
error. Redpanda currently returns this error in some cases (although
this is being reduced in 22.3), but we may as well be more resilient
anyway.
---
 pkg/kgo/txn.go | 77 ++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 62 insertions(+), 15 deletions(-)

diff --git a/pkg/kgo/txn.go b/pkg/kgo/txn.go
index 48f9d692..249af071 100644
--- a/pkg/kgo/txn.go
+++ b/pkg/kgo/txn.go
@@ -261,13 +261,29 @@ func (s *GroupTransactSession) End(ctx context.Context, commit TransactionEndTry
 	kip447 := false
 	if wantCommit && !failed {
 		isAbortableCommitErr := func(err error) bool {
+			// ILLEGAL_GENERATION: rebalance began and completed
+			// before we committed.
+			//
+			// REBALANCE_IN_PROGRESS: rebalance began, abort.
+			//
+			// COORDINATOR_NOT_AVAILABLE,
+			// COORDINATOR_LOAD_IN_PROGRESS,
+			// NOT_COORDINATOR: request failed too many times.
+			//
+			// CONCURRENT_TRANSACTIONS: Kafka not harmonized,
+			// we can just abort.
+			//
+			// UNKNOWN_SERVER_ERROR: technically should not happen,
+			// but we can just abort. Redpanda returns this in
+			// certain versions.
 			switch {
-			case errors.Is(err, kerr.IllegalGeneration), // rebalance begun & completed before we committed
-				errors.Is(err, kerr.RebalanceInProgress), // in rebalance, abort & retry later
-				errors.Is(err, kerr.CoordinatorNotAvailable), // req failed too many times (same for next two)
+			case errors.Is(err, kerr.IllegalGeneration),
+				errors.Is(err, kerr.RebalanceInProgress),
+				errors.Is(err, kerr.CoordinatorNotAvailable),
 				errors.Is(err, kerr.CoordinatorLoadInProgress),
 				errors.Is(err, kerr.NotCoordinator),
-				errors.Is(err, kerr.ConcurrentTransactions): // kafka not harmonized, we can just abort or retry / eventually abort
+				errors.Is(err, kerr.ConcurrentTransactions),
+				errors.Is(err, kerr.UnknownServerError):
 				return true
 			}
 			return false
@@ -376,14 +392,34 @@ func (s *GroupTransactSession) End(ctx context.Context, commit TransactionEndTry
 		"will_try_commit", willTryCommit,
 	)
 
-	retried := false // just in case, we use this to avoid looping
-retryUnattempted:
+	// We have a few potentially retryable errors from EndTransaction.
+	// OperationNotAttempted will be returned at most once.
+	//
+	// UnknownServerError should not be returned, but some brokers do
+	// return it: technically this is fatal, but there is no downside to
+	// retrying (even retrying a commit) and seeing if we are successful
+	// or if we get a better error.
+	var tries int
+retry:
 	endTxnErr := s.cl.EndTransaction(ctx, TransactionEndTry(willTryCommit))
-	if errors.Is(endTxnErr, kerr.OperationNotAttempted) && !retried {
-		willTryCommit = false
-		retried = true
-		s.cl.cfg.logger.Log(LogLevelInfo, "end transaction with commit not attempted; retrying as abort")
-		goto retryUnattempted
+	tries++
+	if endTxnErr != nil && tries < 10 {
+		switch {
+		case errors.Is(endTxnErr, kerr.OperationNotAttempted):
+			s.cl.cfg.logger.Log(LogLevelInfo, "end transaction with commit not attempted; retrying as abort")
+			willTryCommit = false
+			goto retry
+
+		case errors.Is(endTxnErr, kerr.UnknownServerError):
+			s.cl.cfg.logger.Log(LogLevelInfo, "end transaction with commit unknown server error; retrying")
+			after := time.NewTimer(s.cl.cfg.retryBackoff(tries))
+			select {
+			case <-after.C:
+			case <-s.cl.ctx.Done(): // context canceled; we will see that when we retry
+				after.Stop()
+			}
+			goto retry
+		}
 	}
 
 	if !willTryCommit || endTxnErr != nil {
@@ -856,10 +892,15 @@ func (cl *Client) EndTransaction(ctx context.Context, commit TransactionEndTry)
 	})
 
 	// If the returned error is still a Kafka error, this is fatal and we
-	// need to fail our producer ID we loaded above. ConcurrentTransactions
-	// could be
+	// need to fail our producer ID we loaded above.
+	//
+	// UNKNOWN_SERVER_ERROR can theoretically be returned (not all brokers
+	// do). This technically is fatal, but we do not really know whether it
+	// is. We can just return this error and let the caller decide whether
+	// to continue; if the caller does continue, we will eventually receive
+	// our proper transactional error, if any.
 	var ke *kerr.Error
-	if errors.As(err, &ke) && !ke.Retriable {
+	if errors.As(err, &ke) && !ke.Retriable && ke.Code != kerr.UnknownServerError.Code {
 		cl.failProducerID(id, epoch, err)
 	}
 
@@ -1054,8 +1095,14 @@ func (cl *Client) addOffsetsToTxn(ctx context.Context, group string) error {
 
 	// If the returned error is still a Kafka error, this is fatal and we
 	// need to fail our producer ID we created just above.
+	//
+	// We special case UNKNOWN_SERVER_ERROR, because we do not really know
+	// if this is fatal. If it is, we will catch it later on a better
+	// error. Some brokers send this when things fail internally; we can
+	// just abort our commit and see if things are still bad in
+	// EndTransaction.
 	var ke *kerr.Error
-	if errors.As(err, &ke) && !ke.Retriable {
+	if errors.As(err, &ke) && !ke.Retriable && ke.Code != kerr.UnknownServerError.Code {
 		cl.failProducerID(id, epoch, err)
 	}
 
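
For context, below is a minimal sketch (not part of the patch) of how the retried End fits into a typical consume-transform-produce loop. The broker address, topic names, group, and transactional ID are placeholders, and the option set is illustrative. The point is that GroupTransactSession.End absorbs the abortable and retryable errors handled above, including UNKNOWN_SERVER_ERROR, so a caller only checks whether the commit went through and, if not, re-consumes and tries again.

// Sketch only: broker address, topics, group, and transactional ID are
// placeholders; fetch-error handling is trimmed for brevity.
package main

import (
	"context"
	"log"

	"github.com/twmb/franz-go/pkg/kgo"
)

func main() {
	sess, err := kgo.NewGroupTransactSession(
		kgo.SeedBrokers("localhost:9092"),
		kgo.TransactionalID("example-txn-id"),
		kgo.ConsumerGroup("example-group"),
		kgo.ConsumeTopics("source-topic"),
		kgo.DefaultProduceTopic("sink-topic"),
		kgo.FetchIsolationLevel(kgo.ReadCommitted()),
		kgo.RequireStableFetchOffsets(),
	)
	if err != nil {
		log.Fatal(err)
	}
	defer sess.Close()

	ctx := context.Background()
	for {
		fetches := sess.PollFetches(ctx)
		if fetches.IsClientClosed() {
			return
		}

		// Begin a transaction before producing anything.
		if err := sess.Begin(); err != nil {
			log.Fatal(err)
		}

		fetches.EachRecord(func(r *kgo.Record) {
			sess.Produce(ctx, kgo.KeySliceRecord(r.Key, r.Value), nil)
		})

		// End flushes and commits (or aborts). With the patch above, an
		// UNKNOWN_SERVER_ERROR on the commit path is treated like the
		// other abortable errors: the session retries or falls back to
		// aborting rather than immediately failing the producer ID.
		committed, err := sess.End(ctx, kgo.TryCommit)
		if err != nil {
			log.Fatal(err) // fatal transactional error
		}
		if !committed {
			log.Println("transaction aborted; records will be re-consumed")
		}
	}
}

If End ends up aborting, the group offsets are not committed, so the next poll re-delivers the uncommitted records; that is what makes "retry, at worst abort" a safe way to handle an ambiguous UNKNOWN_SERVER_ERROR here.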