sentry: txn_coord_sender.go:823: log.Fatal: transaction unexpectedly committed #67765

cockroach-teamcity · 2021-07-19T18:49:20Z

This issue was autofiled by Sentry. It represents a crash or reported error on a live cluster with telemetry enabled.

Sentry link: https://sentry.io/organizations/cockroach-labs/issues/2520969052/?referrer=webhooks_plugin

Panic message:

txn_coord_sender.go:823: log.Fatal: transaction unexpectedly committed: ×. ba: ×. txn: meta={id=8fadd759 pri=0.00567380 epo=0 ts=1626720528.844090387,0 min=1626720528.844090387,0 seq=8} lock=true stat=COMMITTED rts=0,0 wto=false max=0,0 int=7.
--
*errutil.leafError: log.Fatal: transaction unexpectedly committed: ×. ba: ×. txn: meta={id=8fadd759 pri=0.00567380 epo=0 ts=1626720528.844090387,0 min=1626720528.844090387,0 seq=8} lock=true stat=COMMITTED rts=0,0 wto=false max=0,0 int=7. (1)
txn_coord_sender.go:823: *withstack.withStack (top exception)
(check the extra data payloads)

Stacktrace (expand for inline code snippets):

cockroach/pkg/kv/kvclient/kvcoord/txn_coord_sender.go

Lines 822 to 824 in 628fb8b

    
           	// committed. 
        
           	log.Fatalf(ctx, "transaction unexpectedly committed: %s. ba: %s. txn: %s.", pErr, ba, errTxn) 
        
           }

in pkg/kv/kvclient/kvcoord.sanityCheckCommittedErr

cockroach/pkg/kv/kvclient/kvcoord/txn_coord_sender.go

Lines 795 to 797 in 628fb8b

    
           if errTxn.Status == roachpb.COMMITTED { 
        
           	sanityCheckCommittedErr(ctx, pErr, ba) 
        
           }

in pkg/kv/kvclient/kvcoord.(*TxnCoordSender).updateStateLocked

cockroach/pkg/kv/kvclient/kvcoord/txn_coord_sender.go

Lines 501 to 503 in 628fb8b


	pErr = tc.updateStateLocked(ctx, ba, br, pErr)

in pkg/kv/kvclient/kvcoord.(*TxnCoordSender).Send

cockroach/pkg/kv/db.go

Lines 743 to 745 in 628fb8b

    
           tracing.AnnotateTrace() 
        
           br, pErr := sender.Send(ctx, ba) 
        
           if pErr != nil {

in pkg/kv.(*DB).sendUsingSender

cockroach/pkg/kv/txn.go

Lines 918 to 920 in 628fb8b

    
           txn.mu.Unlock() 
        
           br, pErr := txn.db.sendUsingSender(ctx, ba, sender) 
        
           if pErr == nil {

in pkg/kv.(*Txn).Send

cockroach/pkg/kv/db.go

Lines 653 to 655 in 628fb8b

    
           ba.Header = b.Header 
        
           b.response, b.pErr = send(ctx, ba) 
        
           b.fillResults(ctx)

in pkg/kv.sendAndFill

cockroach/pkg/kv/txn.go

Lines 589 to 591 in 628fb8b

    
           	} 
        
           	return sendAndFill(ctx, txn.Send, b) 
        
           }

in pkg/kv.(*Txn).Run

cockroach/pkg/kv/txn.go

Lines 651 to 653 in 628fb8b

    
           	b.initResult(1 /* calls */, 0, b.raw, nil) 
        
           	return txn.Run(ctx, b) 
        
           }

in pkg/kv.(*Txn).CommitInBatch

cockroach/pkg/sql/tablewriter.go

Lines 148 to 150 in 628fb8b

    
           	// coordinator. 
        
           	err = tb.txn.CommitInBatch(ctx, tb.b) 
        
           } else {

in pkg/sql.(*tableWriterBase).finalize

cockroach/pkg/sql/insert.go

Lines 258 to 260 in 628fb8b

    
           if lastBatch { 
        
           	if err := n.run.ti.finalize(params.ctx); err != nil { 
        
           		return false, err

in pkg/sql.(*insertNode).BatchedNext

cockroach/pkg/sql/plan_batch.go

Lines 111 to 113 in 628fb8b

    
           // First batch, or finished previous batch; advance one. 
        
           if next, err := s.source.BatchedNext(params); !next { 
        
           	return false, err

in pkg/sql.(*serializeNode).Next

cockroach/pkg/sql/plan_node_to_row_source.go

Lines 175 to 177 in 628fb8b

    
           for p.State == execinfra.StateRunning { 
        
           	valid, err := p.node.Next(p.params) 
        
           	if err != nil || !valid {

in pkg/sql.(*planNodeToRowSource).Next

cockroach/pkg/sql/execinfra/base.go

Lines 169 to 171 in 628fb8b

    
           for { 
        
           	row, meta := src.Next() 
        
           	// Emit the row; stop if no more rows are needed.

in pkg/sql/execinfra.Run

cockroach/pkg/sql/execinfra/processorsbase.go

Lines 775 to 777 in 628fb8b

    
           	ctx = pb.self.Start(ctx) 
        
           	Run(ctx, pb.self, pb.Out.output) 
        
           }

in pkg/sql/execinfra.(*ProcessorBase).Run

cockroach/pkg/sql/flowinfra/flow.go

Lines 392 to 394 in 628fb8b

    
           log.VEventf(ctx, 1, "running %T in the flow's goroutine", headProc) 
        
           headProc.Run(ctx) 
        
           return nil

in pkg/sql/flowinfra.(*FlowBase).Run

cockroach/pkg/sql/distsql_running.go

Lines 416 to 418 in 628fb8b

    
           // TODO(radu): this should go through the flow scheduler. 
        
           if err := flow.Run(ctx, func() {}); err != nil { 
        
           	log.Fatalf(ctx, "unexpected error from syncFlow.Start(): %v\n"+

in pkg/sql.(*DistSQLPlanner).Run

cockroach/pkg/sql/distsql_running.go

Lines 996 to 998 in 628fb8b

    
           	recv.expectedRowsRead = int64(physPlan.TotalEstimatedScannedRows) 
        
           	return dsp.Run(planCtx, txn, physPlan, recv, evalCtx, nil /* finishedSetupFn */) 
        
           }

in pkg/sql.(*DistSQLPlanner).PlanAndRun

cockroach/pkg/sql/conn_executor_exec.go

Lines 1000 to 1002 in 628fb8b

    
           // the planner whether or not to plan remote table readers. 
        
           cleanup := ex.server.cfg.DistSQLPlanner.PlanAndRun( 
        
           	ctx, evalCtx, planCtx, planner.txn, planner.curPlan.main, recv,

in pkg/sql.(*connExecutor).execWithDistSQLEngine

cockroach/pkg/sql/conn_executor_exec.go

Lines 871 to 873 in 628fb8b

    
           ex.sessionTracing.TraceExecStart(ctx, "distributed") 
        
           stats, err := ex.execWithDistSQLEngine( 
        
           	ctx, planner, stmt.AST.StatementType(), res, distributePlan.WillDistribute(), progAtomic,

in pkg/sql.(*connExecutor).dispatchToExecutionEngine

cockroach/pkg/sql/conn_executor_exec.go

Lines 638 to 640 in 628fb8b

    
           p.autoCommit = os.ImplicitTxn.Get() && !ex.server.cfg.TestingKnobs.DisableAutoCommit 
        
           if err := ex.dispatchToExecutionEngine(ctx, p, res); err != nil { 
        
           	return nil, nil, err

in pkg/sql.(*connExecutor).execStmtInOpenState

cockroach/pkg/sql/conn_executor_exec.go

Lines 113 to 115 in 628fb8b

    
           } else { 
        
           	ev, payload, err = ex.execStmtInOpenState(ctx, stmt, res, pinfo) 
        
           }

in pkg/sql.(*connExecutor).execStmt

cockroach/pkg/sql/conn_executor_exec.go

Lines 202 to 204 in 628fb8b

    
           if !portal.exhausted { 
        
           	ev, payload, err = ex.execStmt(stmtCtx, curStmt, stmtRes, pinfo) 
        
           	// Portal suspension is supported via a "side" state machine

in pkg/sql.(*connExecutor).execPortal

cockroach/pkg/sql/conn_executor.go

Lines 1532 to 1534 in 628fb8b

    
           res = stmtRes 
        
           ev, payload, err = ex.execPortal(ctx, portal, portalName, stmtRes, pinfo) 
        
           return err

in pkg/sql.(*connExecutor).execCmd.func2

cockroach/pkg/sql/conn_executor.go

Lines 1534 to 1536 in 628fb8b

    
           	return err 
        
           }() 
        
           // Note: we write to ex.statsCollector.phaseTimes, instead of ex.phaseTimes,

in pkg/sql.(*connExecutor).execCmd

cockroach/pkg/sql/conn_executor.go

Lines 1390 to 1392 in 628fb8b

    
           var err error 
        
           if err = ex.execCmd(ex.Ctx()); err != nil { 
        
           	if errors.IsAny(err, io.EOF, errDrainingComplete) {

in pkg/sql.(*connExecutor).run

cockroach/pkg/sql/conn_executor.go

Lines 507 to 509 in 628fb8b

    
           	}() 
        
           	return h.ex.run(ctx, s.pool, reserved, cancel) 
        
           }

in pkg/sql.(*Server).ServeConn

cockroach/pkg/sql/pgwire/conn.go

Lines 625 to 627 in 628fb8b

    
           	reservedOwned = false // We're about to pass ownership away. 
        
           	retErr = sqlServer.ServeConn(ctx, connHandler, reserved, cancelConn) 
        
           }()

in pkg/sql/pgwire.(*conn).processCommandsAsync.func1
/usr/local/go/src/runtime/asm_amd64.s#L1356-L1358 in runtime.goexit

pkg/kv/kvclient/kvcoord/txn_coord_sender.go in pkg/kv/kvclient/kvcoord.sanityCheckCommittedErr at line 823
pkg/kv/kvclient/kvcoord/txn_coord_sender.go in pkg/kv/kvclient/kvcoord.(*TxnCoordSender).updateStateLocked at line 796
pkg/kv/kvclient/kvcoord/txn_coord_sender.go in pkg/kv/kvclient/kvcoord.(*TxnCoordSender).Send at line 502
pkg/kv/db.go in pkg/kv.(*DB).sendUsingSender at line 744
pkg/kv/txn.go in pkg/kv.(*Txn).Send at line 919
pkg/kv/db.go in pkg/kv.sendAndFill at line 654
pkg/kv/txn.go in pkg/kv.(*Txn).Run at line 590
pkg/kv/txn.go in pkg/kv.(*Txn).CommitInBatch at line 652
pkg/sql/tablewriter.go in pkg/sql.(*tableWriterBase).finalize at line 149
pkg/sql/insert.go in pkg/sql.(*insertNode).BatchedNext at line 259
pkg/sql/plan_batch.go in pkg/sql.(*serializeNode).Next at line 112
pkg/sql/plan_node_to_row_source.go in pkg/sql.(*planNodeToRowSource).Next at line 176
pkg/sql/execinfra/base.go in pkg/sql/execinfra.Run at line 170
pkg/sql/execinfra/processorsbase.go in pkg/sql/execinfra.(*ProcessorBase).Run at line 776
pkg/sql/flowinfra/flow.go in pkg/sql/flowinfra.(*FlowBase).Run at line 393
pkg/sql/distsql_running.go in pkg/sql.(*DistSQLPlanner).Run at line 417
pkg/sql/distsql_running.go in pkg/sql.(*DistSQLPlanner).PlanAndRun at line 997
pkg/sql/conn_executor_exec.go in pkg/sql.(*connExecutor).execWithDistSQLEngine at line 1001
pkg/sql/conn_executor_exec.go in pkg/sql.(*connExecutor).dispatchToExecutionEngine at line 872
pkg/sql/conn_executor_exec.go in pkg/sql.(*connExecutor).execStmtInOpenState at line 639
pkg/sql/conn_executor_exec.go in pkg/sql.(*connExecutor).execStmt at line 114
pkg/sql/conn_executor_exec.go in pkg/sql.(*connExecutor).execPortal at line 203
pkg/sql/conn_executor.go in pkg/sql.(*connExecutor).execCmd.func2 at line 1533
pkg/sql/conn_executor.go in pkg/sql.(*connExecutor).execCmd at line 1535
pkg/sql/conn_executor.go in pkg/sql.(*connExecutor).run at line 1391
pkg/sql/conn_executor.go in pkg/sql.(*Server).ServeConn at line 508
pkg/sql/pgwire/conn.go in pkg/sql/pgwire.(*conn).processCommandsAsync.func1 at line 626
/usr/local/go/src/runtime/asm_amd64.s in runtime.goexit at line 1357

Tag	Value
Cockroach Release	`v20.2.7`
Cockroach SHA:	`628fb8b`
Platform	linux amd64
Distribution	CCL
Environment	v20.2.7
Command	server
Go Version	``
# of CPUs
# of Goroutines

Jira issue: CRDB-9268

gz#15309

The text was updated successfully, but these errors were encountered:

yuzefovich · 2021-07-21T00:01:50Z

dup of #57552

knz · 2021-08-12T10:52:12Z

Reusing this as most recent instance in Sentry

mwang1026 · 2021-08-20T19:05:29Z

@nvanbenschoten is this actionable? otherwise we should close

tbg · 2021-12-08T10:36:30Z

This has come up in a customer incident, see https://github.com/cockroachlabs/support/issues/1315.

We are currently investigating how we are hitting this assertion, but while it fires it is doing more harm than good. See cockroachdb#67765. The provided test exercises the case in which the error is returned (i.e. opted out of the assertion). Touches cockroachlabs/support#1315. Release note: None

73603: kvcoord: make "txn unexpectedly committed" assertion opt-out r=nvanbenschoten a=tbg We are currently investigating how we are hitting this assertion, but while it fires it is doing more harm than good. See #67765. The provided test exercises the case in which the error is returned (i.e. opted out of the assertion). Touches https://github.com/cockroachlabs/support/issues/1315. Release note: None Co-authored-by: Tobias Grieger <[email protected]>

We are currently investigating how we are hitting this assertion, but while it fires it is doing more harm than good. See cockroachdb#67765. The provided test exercises the case in which the error is returned (i.e. opted out of the assertion). Touches cockroachlabs/support#1315. Release note: None

tbg · 2022-01-06T11:16:52Z

The assertion is now opt-out. Unfortunately, at present we have only an unsubstantiated theory on what could be causing these crashes, namely that DistSender retries a commit attempt that was already successful (it should be returning an AmbiguousResultError in this case). We suspected that possibly

cockroach/pkg/util/grpcutil/grpc_util.go

Lines 141 to 146 in 3fc6cc5

    
           if errors.HasType(err, connectionNotReadyError{}) || 
        
           	errors.HasType(err, (*netutil.InitialHeartbeatFailedError)(nil)) || 
        
           	errors.Is(err, circuit.ErrBreakerOpen) || 
        
           	IsConnectionRejected(err) { 
        
           	return true 
        
           }

has false positives (i.e. returns true when it shouldn't) but we were unable to corroborate this. I'm going to be putting this on hold in the hope that we will get more information from the next occurrence.

While previously, RPC failures were assumed to be retriable, as write operations (with the notable exception of `EndTxn`) were assumed to be idempotent, it has been seen in cockroachdb#67765 and documented in cockroachdb#103817 that RPC failures on write operations that occur in parallel with a commit (i.e. a partial batch where `withCommit==true`), it is not always possible to assume idempotency and retry the "ambiguous" writes. This is due to the fact that the retried write RPC could result in the transaction's `WriteTimestamp` being bumped, changing the commit timestamp of the transaction that may in fact already be implicitly committed if the initial "ambiguous" write actually succeeded. This change modifies the protocol of the DistSender to flag in subsequent retries that a batch with a commit has previously experienced ambiguity, as well as the handling of the retried write in the MVCC layer to detect this previous ambiguity and reject retries that change the write timestamp as a non-idempotent replay. The flag allows subsequent retries to "remember" the earlier ambiguous write and evaluate accordingly. The flag allows us to properly handle RPC failures (i.e. ambiguous writes) that occur on commit, as a transaction that is implicitly committed is eligible to be marked as explicitly committed by contending transactions via the `RecoverTxn` operation, resulting in a race between retries by the transaction coordinator and recovery by contending transactions that could result in either incorrectly reporting a transaction as having failed with a `RETRY_SERIALIZABLE` error (despite the possibility that it already was or could be recovered and successfully committed), or in attempting to explicitly commit an already-recovered and committed transaction, resulting in seeing an assertion failure due to `transaction unexpectedly committed`. The replay protection introduced here allows us to avoid both of these situations by detecting a replay that should be considered non-idempotent and returning an error, causing the original RPC error remembered by the DistSender to be propagated as an `AmbiguousResultError`. As such, this can be handled by application code by validating the success/failure of a transaction when receiving this error. Depends on cockroachdb#107680, cockroachdb#107323, cockroachdb#108154, cockroachdb#108001 Fixes: cockroachdb#103817 Release note (bug fix): Properly handles RPC failures on writes using the parallel commit protocol that execute in parallel to the commit operation, avoiding incorrect retriable failures and `transaction unexpectedly committed` assertions by detecting when writes cannot be retried idempotently, instead returning an `AmbiguousResultError`.

While previously, RPC failures were assumed to be retryable, as write operations (with the notable exception of `EndTxn`) were assumed to be idempotent, it has been seen in cockroachdb#67765 and documented in cockroachdb#103817 that RPC failures on write operations that occur in parallel with a commit (i.e. a partial batch where `withCommit==true`), it is not always possible to assume idempotency and retry the "ambiguous" writes. This is due to the fact that the retried write RPC could result in the transaction's `WriteTimestamp` being bumped, changing the commit timestamp of the transaction that may in fact already be implicitly committed if the initial "ambiguous" write actually succeeded. This change modifies the protocol of the DistSender to flag in subsequent retries that a batch with a commit has previously experienced ambiguity, as well as the handling of the retried write in the MVCC layer to detect this previous ambiguity and reject retries that change the write timestamp as a non-idempotent replay. The flag allows subsequent retries to "remember" the earlier ambiguous write and evaluate accordingly. The flag allows us to properly handle RPC failures (i.e. ambiguous writes) that occur on commit, as a transaction that is implicitly committed is eligible to be marked as explicitly committed by contending transactions via the `RecoverTxn` operation, resulting in a race between retries by the transaction coordinator and recovery by contending transactions that could result in either incorrectly reporting a transaction as having failed with a `RETRY_SERIALIZABLE` error (despite the possibility that it already was or could be recovered and successfully committed), or in attempting to explicitly commit an already-recovered and committed transaction, resulting in seeing an assertion failure due to `transaction unexpectedly committed`. The replay protection introduced here allows us to avoid both of these situations by detecting a replay that should be considered non-idempotent and returning an error, causing the original RPC error remembered by the DistSender to be propagated as an `AmbiguousResultError`. As such, this can be handled by application code by validating the success/failure of a transaction when receiving this error. Depends on cockroachdb#107680, cockroachdb#107323, cockroachdb#108154, cockroachdb#108001 Fixes: cockroachdb#103817 Release note (bug fix): Properly handles RPC failures on writes using the parallel commit protocol that execute in parallel to the commit operation, avoiding incorrect retryable failures and `transaction unexpectedly committed` assertions by detecting when writes cannot be retried idempotently, instead returning an `AmbiguousResultError`.

107658: kv: enable replay protection for ambiguous writes on commits r=AlexTalks a=AlexTalks While previously, RPC failures were assumed to be retriable, as write operations (with the notable exception of `EndTxn`) were assumed to be idempotent, it has been seen in #67765 and documented in #103817 that RPC failures on write operations that occur in parallel with a commit (i.e. a partial batch where `withCommit==true`), it is not always possible to assume idempotency and retry the "ambiguous" writes. This is due to the fact that the retried write RPC could result in the transaction's `WriteTimestamp` being bumped, changing the commit timestamp of the transaction that may in fact already be implicitly committed if the initial "ambiguous" write actually succeeded. This change modifies the protocol of the DistSender to flag in subsequent retries that a batch with a commit has previously experienced ambiguity, as well as the handling of the retried write in the MVCC layer to detect this previous ambiguity and reject retries that change the write timestamp as a non-idempotent replay. The flag allows subsequent retries to "remember" the earlier ambiguous write and evaluate accordingly. The flag allows us to properly handle RPC failures (i.e. ambiguous writes) that occur on commit, as a transaction that is implicitly committed is eligible to be marked as explicitly committed by contending transactions via the `RecoverTxn` operation, resulting in a race between retries by the transaction coordinator and recovery by contending transactions that could result in either incorrectly reporting a transaction as having failed with a `RETRY_SERIALIZABLE` error (despite the possibility that it already was or could be recovered and successfully committed), or in attempting to explicitly commit an already-recovered and committed transaction, resulting in seeing an assertion failure due to `transaction unexpectedly committed`. The replay protection introduced here allows us to avoid both of these situations by detecting a replay that should be considered non-idempotent and returning an error, causing the original RPC error remembered by the DistSender to be propagated as an `AmbiguousResultError`. As such, this can be handled by application code by validating the success/failure of a transaction when receiving this error. Depends on #107680, #107323, #108154, #108001 Fixes: #103817 Release note (bug fix): Properly handles RPC failures on writes using the parallel commit protocol that execute in parallel to the commit operation, avoiding incorrect retriable failures and `transaction unexpectedly committed` assertions by detecting when writes cannot be retried idempotently, instead returning an `AmbiguousResultError`. Co-authored-by: Alex Sarkesian <[email protected]>

This adds a unit test to reproduce the behavior described in cockroachdb#103817 and seen in cockroachdb#67765, which currently is a bug in our implementation of the parallel commit protocol that results in the assertion failure known as `transaction unexpectedly committed`. The test currently validates the incorrect behavior of the known issue, though it is inded to be used to validate the potential fixes as proposed in cockroachdb#103817. Release note: None Part of: cockroachdb#103817

While previously, RPC failures were assumed to be retryable, as write operations (with the notable exception of `EndTxn`) were assumed to be idempotent, it has been seen in cockroachdb#67765 and documented in cockroachdb#103817 that RPC failures on write operations that occur in parallel with a commit (i.e. a partial batch where `withCommit==true`), it is not always possible to assume idempotency and retry the "ambiguous" writes. This is due to the fact that the retried write RPC could result in the transaction's `WriteTimestamp` being bumped, changing the commit timestamp of the transaction that may in fact already be implicitly committed if the initial "ambiguous" write actually succeeded. This change modifies the protocol of the DistSender to flag in subsequent retries that a batch with a commit has previously experienced ambiguity, as well as the handling of the retried write in the MVCC layer to detect this previous ambiguity and reject retries that change the write timestamp as a non-idempotent replay. The flag allows subsequent retries to "remember" the earlier ambiguous write and evaluate accordingly. The flag allows us to properly handle RPC failures (i.e. ambiguous writes) that occur on commit, as a transaction that is implicitly committed is eligible to be marked as explicitly committed by contending transactions via the `RecoverTxn` operation, resulting in a race between retries by the transaction coordinator and recovery by contending transactions that could result in either incorrectly reporting a transaction as having failed with a `RETRY_SERIALIZABLE` error (despite the possibility that it already was or could be recovered and successfully committed), or in attempting to explicitly commit an already-recovered and committed transaction, resulting in seeing an assertion failure due to `transaction unexpectedly committed`. The replay protection introduced here allows us to avoid both of these situations by detecting a replay that should be considered non-idempotent and returning an error, causing the original RPC error remembered by the DistSender to be propagated as an `AmbiguousResultError`. As such, this can be handled by application code by validating the success/failure of a transaction when receiving this error. Depends on cockroachdb#107680, cockroachdb#107323, cockroachdb#108154, cockroachdb#108001 Fixes: cockroachdb#103817 Release note (bug fix): Properly handles RPC failures on writes using the parallel commit protocol that execute in parallel to the commit operation, avoiding incorrect retryable failures and `transaction unexpectedly committed` assertions by detecting when writes cannot be retried idempotently, instead returning an `AmbiguousResultError`.

This adds a unit test to reproduce the behavior described in cockroachdb#103817 and seen in cockroachdb#67765, which currently is a bug in our implementation of the parallel commit protocol that results in the assertion failure known as `transaction unexpectedly committed`. The test currently validates the incorrect behavior of the known issue, though it is inded to be used to validate the potential fixes as proposed in cockroachdb#103817. Release note: None Part of: cockroachdb#103817

While previously, RPC failures were assumed to be retryable, as write operations (with the notable exception of `EndTxn`) were assumed to be idempotent, it has been seen in cockroachdb#67765 and documented in cockroachdb#103817 that RPC failures on write operations that occur in parallel with a commit (i.e. a partial batch where `withCommit==true`), it is not always possible to assume idempotency and retry the "ambiguous" writes. This is due to the fact that the retried write RPC could result in the transaction's `WriteTimestamp` being bumped, changing the commit timestamp of the transaction that may in fact already be implicitly committed if the initial "ambiguous" write actually succeeded. This change modifies the protocol of the DistSender to flag in subsequent retries that a batch with a commit has previously experienced ambiguity, as well as the handling of the retried write in the MVCC layer to detect this previous ambiguity and reject retries that change the write timestamp as a non-idempotent replay. The flag allows subsequent retries to "remember" the earlier ambiguous write and evaluate accordingly. The flag allows us to properly handle RPC failures (i.e. ambiguous writes) that occur on commit, as a transaction that is implicitly committed is eligible to be marked as explicitly committed by contending transactions via the `RecoverTxn` operation, resulting in a race between retries by the transaction coordinator and recovery by contending transactions that could result in either incorrectly reporting a transaction as having failed with a `RETRY_SERIALIZABLE` error (despite the possibility that it already was or could be recovered and successfully committed), or in attempting to explicitly commit an already-recovered and committed transaction, resulting in seeing an assertion failure due to `transaction unexpectedly committed`. The replay protection introduced here allows us to avoid both of these situations by detecting a replay that should be considered non-idempotent and returning an error, causing the original RPC error remembered by the DistSender to be propagated as an `AmbiguousResultError`. As such, this can be handled by application code by validating the success/failure of a transaction when receiving this error. Depends on cockroachdb#107680, cockroachdb#107323, cockroachdb#108154, cockroachdb#108001 Fixes: cockroachdb#103817 Release note (bug fix): Properly handles RPC failures on writes using the parallel commit protocol that execute in parallel to the commit operation, avoiding incorrect retryable failures and `transaction unexpectedly committed` assertions by detecting when writes cannot be retried idempotently, instead returning an `AmbiguousResultError`.

This adds a unit test to reproduce the behavior described in cockroachdb#103817 and seen in cockroachdb#67765, which currently is a bug in our implementation of the parallel commit protocol that results in the assertion failure known as `transaction unexpectedly committed`. The test currently validates the incorrect behavior of the known issue, though it is inded to be used to validate the potential fixes as proposed in cockroachdb#103817. Release note: None Part of: cockroachdb#103817

While previously, RPC failures were assumed to be retryable, as write operations (with the notable exception of `EndTxn`) were assumed to be idempotent, it has been seen in cockroachdb#67765 and documented in cockroachdb#103817 that RPC failures on write operations that occur in parallel with a commit (i.e. a partial batch where `withCommit==true`), it is not always possible to assume idempotency and retry the "ambiguous" writes. This is due to the fact that the retried write RPC could result in the transaction's `WriteTimestamp` being bumped, changing the commit timestamp of the transaction that may in fact already be implicitly committed if the initial "ambiguous" write actually succeeded. This change modifies the protocol of the DistSender to flag in subsequent retries that a batch with a commit has previously experienced ambiguity, as well as the handling of the retried write in the MVCC layer to detect this previous ambiguity and reject retries that change the write timestamp as a non-idempotent replay. The flag allows subsequent retries to "remember" the earlier ambiguous write and evaluate accordingly. The flag allows us to properly handle RPC failures (i.e. ambiguous writes) that occur on commit, as a transaction that is implicitly committed is eligible to be marked as explicitly committed by contending transactions via the `RecoverTxn` operation, resulting in a race between retries by the transaction coordinator and recovery by contending transactions that could result in either incorrectly reporting a transaction as having failed with a `RETRY_SERIALIZABLE` error (despite the possibility that it already was or could be recovered and successfully committed), or in attempting to explicitly commit an already-recovered and committed transaction, resulting in seeing an assertion failure due to `transaction unexpectedly committed`. The replay protection introduced here allows us to avoid both of these situations by detecting a replay that should be considered non-idempotent and returning an error, causing the original RPC error remembered by the DistSender to be propagated as an `AmbiguousResultError`. As such, this can be handled by application code by validating the success/failure of a transaction when receiving this error. Depends on cockroachdb#107680, cockroachdb#107323, cockroachdb#108154, cockroachdb#108001 Fixes: cockroachdb#103817 Release note (bug fix): Properly handles RPC failures on writes using the parallel commit protocol that execute in parallel to the commit operation, avoiding incorrect retryable failures and `transaction unexpectedly committed` assertions by detecting when writes cannot be retried idempotently, instead returning an `AmbiguousResultError`.

This adds a unit test to reproduce the behavior described in cockroachdb#103817 and seen in cockroachdb#67765, which currently is a bug in our implementation of the parallel commit protocol that results in the assertion failure known as `transaction unexpectedly committed`. The test currently validates the incorrect behavior of the known issue, though it is inded to be used to validate the potential fixes as proposed in cockroachdb#103817. Release note: None Part of: cockroachdb#103817

While previously, RPC failures were assumed to be retryable, as write operations (with the notable exception of `EndTxn`) were assumed to be idempotent, it has been seen in cockroachdb#67765 and documented in cockroachdb#103817 that RPC failures on write operations that occur in parallel with a commit (i.e. a partial batch where `withCommit==true`), it is not always possible to assume idempotency and retry the "ambiguous" writes. This is due to the fact that the retried write RPC could result in the transaction's `WriteTimestamp` being bumped, changing the commit timestamp of the transaction that may in fact already be implicitly committed if the initial "ambiguous" write actually succeeded. This change modifies the protocol of the DistSender to flag in subsequent retries that a batch with a commit has previously experienced ambiguity, as well as the handling of the retried write in the MVCC layer to detect this previous ambiguity and reject retries that change the write timestamp as a non-idempotent replay. The flag allows subsequent retries to "remember" the earlier ambiguous write and evaluate accordingly. The flag allows us to properly handle RPC failures (i.e. ambiguous writes) that occur on commit, as a transaction that is implicitly committed is eligible to be marked as explicitly committed by contending transactions via the `RecoverTxn` operation, resulting in a race between retries by the transaction coordinator and recovery by contending transactions that could result in either incorrectly reporting a transaction as having failed with a `RETRY_SERIALIZABLE` error (despite the possibility that it already was or could be recovered and successfully committed), or in attempting to explicitly commit an already-recovered and committed transaction, resulting in seeing an assertion failure due to `transaction unexpectedly committed`. The replay protection introduced here allows us to avoid both of these situations by detecting a replay that should be considered non-idempotent and returning an error, causing the original RPC error remembered by the DistSender to be propagated as an `AmbiguousResultError`. As such, this can be handled by application code by validating the success/failure of a transaction when receiving this error. Depends on cockroachdb#107680, cockroachdb#107323, cockroachdb#108154, cockroachdb#108001 Fixes: cockroachdb#103817 Release note (bug fix): Properly handles RPC failures on writes using the parallel commit protocol that execute in parallel to the commit operation, avoiding incorrect retryable failures and `transaction unexpectedly committed` assertions by detecting when writes cannot be retried idempotently, instead returning an `AmbiguousResultError`.

cockroach-teamcity added C-bug Code not up to spec/doc, specs & docs deemed correct. Solution expected to change code/behavior. O-sentry Originated from an in-the-wild panic report. labels Jul 19, 2021

yuzefovich closed this as completed Jul 21, 2021

knz mentioned this issue Aug 12, 2021

kvclient: v20.2.2: transaction unexpectedly committed #57552

Closed

knz reopened this Aug 12, 2021

blathers-crl bot added the T-kv KV Team label Aug 12, 2021

tbg self-assigned this Dec 8, 2021

tbg mentioned this issue Dec 8, 2021

kvcoord: make "txn unexpectedly committed" assertion opt-out #73603

Merged

This was referenced Dec 14, 2021

release-21.2: kvcoord: make "txn unexpectedly committed" assertion opt-out #73779

Merged

release-21.1: kvcoord: make "txn unexpectedly committed" assertion opt-out #73780

Merged

tbg mentioned this issue Dec 14, 2021

release-20.2: kvcoord: make "txn unexpectedly committed" assertion opt-out #73782

Closed

yuzefovich mentioned this issue Mar 30, 2022

sentry: txn_coord_sender.go:941: log.Fatal: transaction unexpectedly committed, ba: Put [×,×), Delete [×,×), ConditionalPut [×,×), EndTxn(commit) [×], [txn: 288a14e4]. txn: "sql txn" meta={id=288a14e4 pri=0.0... #78504

Closed

AlexTalks mentioned this issue Sep 19, 2023

release-23.1: kvcoord: add test for transaction unexpectedly committed #110874

Closed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

sentry: txn_coord_sender.go:823: log.Fatal: transaction unexpectedly committed #67765

sentry: txn_coord_sender.go:823: log.Fatal: transaction unexpectedly committed #67765

cockroach-teamcity commented Jul 19, 2021 •

edited by RoachietheSupportRoach

Loading

yuzefovich commented Jul 21, 2021

knz commented Aug 12, 2021

mwang1026 commented Aug 20, 2021

tbg commented Dec 8, 2021

tbg commented Jan 6, 2022

sentry: txn_coord_sender.go:823: log.Fatal: transaction unexpectedly committed #67765

sentry: txn_coord_sender.go:823: log.Fatal: transaction unexpectedly committed #67765

Comments

cockroach-teamcity commented Jul 19, 2021 • edited by RoachietheSupportRoach Loading

yuzefovich commented Jul 21, 2021

knz commented Aug 12, 2021

mwang1026 commented Aug 20, 2021

tbg commented Dec 8, 2021

tbg commented Jan 6, 2022

cockroach-teamcity commented Jul 19, 2021 •

edited by RoachietheSupportRoach

Loading