kvserver: v20.1.0-beta.3: received ... results, limit was ... #46652

Closed
cockroach-teamcity opened this issue Mar 26, 2020 · 23 comments · Fixed by #48160
Labels
C-bug: Code not up to spec/doc, specs & docs deemed correct. Solution expected to change code/behavior.
O-sentry: Originated from an in-the-wild panic report.

Comments

@cockroach-teamcity (Member)

This issue was autofiled by Sentry. It represents a crash or reported error on a live cluster with telemetry enabled.

Sentry link: https://sentry.io/organizations/cockroach-labs/issues/1583480205/?referrer=webhooks_plugin

Panic message:

replica_read.go:95: received %d results, limit was %d | int64; int64

Stacktrace (with inline code snippets):

}
br, res, pErr = evaluateBatch(ctx, storagebase.CmdIDKey(""), rw, rec, nil, ba, true /* readOnly */)
in pkg/kv/kvserver.(*Replica).executeReadOnlyBatchWithServersideRefreshes
var result result.Result
br, result, pErr = r.executeReadOnlyBatchWithServersideRefreshes(ctx, rw, rec, ba, spans)
if err := r.handleReadOnlyLocalEvalResult(ctx, ba, result.Local); err != nil {
in pkg/kv/kvserver.(*Replica).executeReadOnlyBatch
br, g, pErr = fn(r, ctx, ba, status, g)
switch t := pErr.GetDetail().(type) {
in pkg/kv/kvserver.(*Replica).executeBatchWithConcurrencyRetries
fn := (*Replica).executeReadOnlyBatch
br, pErr = r.executeBatchWithConcurrencyRetries(ctx, ba, fn)
} else if ba.IsAdmin() {
in pkg/kv/kvserver.(*Replica).sendWithRangeID
) (*roachpb.BatchResponse, *roachpb.Error) {
return r.sendWithRangeID(ctx, r.RangeID, &ba)
}
in pkg/kv/kvserver.(*Replica).Send
br, pErr = repl.Send(ctx, ba)
if pErr == nil {
in pkg/kv/kvserver.(*Store).Send
br, pErr := store.Send(ctx, ba)
if br != nil && br.Error != nil {
in pkg/kv/kvserver.(*Stores).Send
var pErr *roachpb.Error
br, pErr = n.stores.Send(ctx, *args)
if pErr != nil {
in pkg/server.(*Node).batchInternal.func1
return f(ctx)
}
in pkg/util/stop.(*Stopper).RunTaskWithErr
var br *roachpb.BatchResponse
if err := n.stopper.RunTaskWithErr(ctx, "node.Node: batch", func(ctx context.Context) error {
var finishSpan func(*roachpb.BatchResponse)
in pkg/server.(*Node).batchInternal
br, err := n.batchInternal(ctx, args)
in pkg/server.(*Node).Batch
) (*roachpb.BatchResponse, error) {
return a.InternalServer.Batch(ctx, ba)
}
in pkg/rpc.internalClientAdapter.Batch
}
reply, err := iface.Batch(ctx, &ba)
// If we queried a remote node, perform extra validation and
in pkg/kv/kvclient/kvcoord.(*grpcTransport).sendBatch
ba.Replica = client.replica
reply, err := gt.sendBatch(ctx, client.replica.NodeID, iface, ba)
in pkg/kv/kvclient/kvcoord.(*grpcTransport).SendNext
}
br, err := transport.SendNext(ctx, ba)
// maxSeenLeaseSequence tracks the maximum LeaseSequence seen in a
in pkg/kv/kvclient/kvcoord.(*DistSender).sendToReplicas
return ds.sendToReplicas(
ctx,
in pkg/kv/kvclient/kvcoord.(*DistSender).sendRPC
class := rpc.ConnectionClassForKey(desc.RSpan().Key)
br, err := ds.sendRPC(ctx, ba, class, desc.RangeID, replicas, cachedLeaseHolder, withCommit)
if err != nil {
in pkg/kv/kvclient/kvcoord.(*DistSender).sendSingleRange
reply, pErr = ds.sendSingleRange(ctx, ba, desc, withCommit)
in pkg/kv/kvclient/kvcoord.(*DistSender).sendPartialBatch
if !ri.NeedAnother(rs) {
resp := ds.sendPartialBatch(
ctx, ba, rs, ri.Desc(), ri.Token(), withCommit, batchIdx, false, /* needsTruncate */
in pkg/kv/kvclient/kvcoord.(*DistSender).divideAndSendBatchToRanges
} else {
rpl, pErr = ds.divideAndSendBatchToRanges(ctx, ba, rs, withCommit, 0 /* batchIdx */)
}
in pkg/kv/kvclient/kvcoord.(*DistSender).Send
defer gs.mu.Lock()
return gs.wrapped.Send(ctx, ba)
}
in pkg/kv/kvclient/kvcoord.(*txnLockGatekeeper).SendLocked
br, pErr := m.wrapped.SendLocked(ctx, ba)
if pErr != nil {
in pkg/kv/kvclient/kvcoord.(*txnMetricRecorder).SendLocked
if !hasET {
return tc.wrapped.SendLocked(ctx, ba)
}
in pkg/kv/kvclient/kvcoord.(*txnCommitter).SendLocked
}
br, pErr := sr.wrapped.SendLocked(ctx, ba)
in pkg/kv/kvclient/kvcoord.(*txnSpanRefresher).sendLockedWithRefreshAttempts
// Send through wrapped lockedSender. Unlocks while sending then re-locks.
br, pErr := sr.sendLockedWithRefreshAttempts(ctx, ba, maxAttempts)
if pErr != nil {
in pkg/kv/kvclient/kvcoord.(*txnSpanRefresher).SendLocked
// Send through wrapped lockedSender. Unlocks while sending then re-locks.
br, pErr := tp.wrapped.SendLocked(ctx, ba)
in pkg/kv/kvclient/kvcoord.(*txnPipeliner).SendLocked
return s.wrapped.SendLocked(ctx, ba)
}
in pkg/kv/kvclient/kvcoord.(*txnSeqNumAllocator).SendLocked
// Forward the batch through the wrapped lockedSender.
return h.wrapped.SendLocked(ctx, ba)
}
in pkg/kv/kvclient/kvcoord.(*txnHeartbeater).SendLocked
// Send the command through the txnInterceptor stack.
br, pErr := tc.interceptorStack[0].SendLocked(ctx, ba)
in pkg/kv/kvclient/kvcoord.(*TxnCoordSender).Send

tracing.AnnotateTrace()
br, pErr := sender.Send(ctx, ba)
if pErr != nil {
in pkg/kv.(*DB).sendUsingSender

txn.mu.Unlock()
br, pErr := txn.db.sendUsingSender(ctx, ba, sender)
if pErr == nil {
in pkg/kv.(*Txn).Send
sendFn := func(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, error) {
res, err := txn.Send(ctx, ba)
if err != nil {
in pkg/sql/row.makeKVBatchFetcher.func1
br, err := f.sendFn(ctx, ba)
if err != nil {
in pkg/sql/row.(*txnKVFetcher).fetch
}
if err := f.fetch(ctx); err != nil {
return false, nil, nil, roachpb.Span{}, err
in pkg/sql/row.(*txnKVFetcher).nextBatch
ok, f.kvs, f.batchResponse, f.Span, err = f.nextBatch(ctx)
if err != nil {
in pkg/sql/row.(*KVFetcher).NextKV
for {
ok, rf.kv, _, err = rf.kvFetcher.NextKV(ctx)
if err != nil {
in pkg/sql/row.(*Fetcher).NextKey
// Retrieve the first key.
_, err := rf.NextKey(ctx)
return err
in pkg/sql/row.(*Fetcher).StartScanFrom
}
return rf.StartScanFrom(ctx, &f)
}
in pkg/sql/row.(*Fetcher).StartScan
if tr.maxTimestampAge == 0 {
err = tr.fetcher.StartScan(
ctx, tr.FlowCtx.Txn, tr.spans,
in pkg/sql/rowexec.(*tableReader).Start
}
ctx = pb.self.Start(ctx)
Run(ctx, pb.self, pb.Out.output)
in pkg/sql/execinfra.(*ProcessorBase).Run
}
headProc.Run(ctx)
return nil
in pkg/sql/flowinfra.(*FlowBase).Run
// TODO(radu): this should go through the flow scheduler.
if err := flow.Run(ctx, func() {}); err != nil {
log.Fatalf(ctx, "unexpected error from syncFlow.Start(): %s "+
in pkg/sql.(*DistSQLPlanner).Run
recv.expectedRowsRead = int64(physPlan.TotalEstimatedScannedRows)
return dsp.Run(planCtx, txn, &physPlan, recv, evalCtx, nil /* finishedSetupFn */)
}
in pkg/sql.(*DistSQLPlanner).PlanAndRun
// the planner whether or not to plan remote table readers.
cleanup := ex.server.cfg.DistSQLPlanner.PlanAndRun(
ctx, evalCtx, planCtx, planner.txn, planner.curPlan.plan, recv,
in pkg/sql.(*connExecutor).execWithDistSQLEngine
ex.sessionTracing.TraceExecStart(ctx, "distributed")
bytesRead, rowsRead, err := ex.execWithDistSQLEngine(ctx, planner, stmt.AST.StatementType(), res, distributePlan, progAtomic)
ex.sessionTracing.TraceExecEnd(ctx, res.Err(), res.RowsAffected())
in pkg/sql.(*connExecutor).dispatchToExecutionEngine
p.autoCommit = os.ImplicitTxn.Get() && !ex.server.cfg.TestingKnobs.DisableAutoCommit
if err := ex.dispatchToExecutionEngine(ctx, p, res); err != nil {
return nil, nil, err
in pkg/sql.(*connExecutor).execStmtInOpenState
} else {
ev, payload, err = ex.execStmtInOpenState(ctx, stmt, res, pinfo)
}
in pkg/sql.(*connExecutor).execStmt
stmtCtx := withStatement(ctx, ex.curStmt)
ev, payload, err = ex.execStmt(stmtCtx, curStmt, stmtRes, nil /* pinfo */)
if err != nil {
in pkg/sql.(*connExecutor).execCmd
var err error
if err = ex.execCmd(ex.Ctx()); err != nil {
if err == io.EOF || err == errDrainingComplete {
in pkg/sql.(*connExecutor).run
}()
return h.ex.run(ctx, s.pool, reserved, cancel)
}
in pkg/sql.(*Server).ServeConn
reservedOwned = false // We're about to pass ownership away.
retErr = sqlServer.ServeConn(ctx, connHandler, reserved, cancelConn)
}()
in pkg/sql/pgwire.(*conn).processCommandsAsync.func1

pkg/kv/kvserver/replica_read.go in pkg/kv/kvserver.(*Replica).executeReadOnlyBatchWithServersideRefreshes at line 95
pkg/kv/kvserver/replica_read.go in pkg/kv/kvserver.(*Replica).executeReadOnlyBatch at line 67
pkg/kv/kvserver/replica_send.go in pkg/kv/kvserver.(*Replica).executeBatchWithConcurrencyRetries at line 236
pkg/kv/kvserver/replica_send.go in pkg/kv/kvserver.(*Replica).sendWithRangeID at line 98
pkg/kv/kvserver/replica_send.go in pkg/kv/kvserver.(*Replica).Send at line 36
pkg/kv/kvserver/store_send.go in pkg/kv/kvserver.(*Store).Send at line 204
pkg/kv/kvserver/stores.go in pkg/kv/kvserver.(*Stores).Send at line 188
pkg/server/node.go in pkg/server.(*Node).batchInternal.func1 at line 925
pkg/util/stop/stopper.go in pkg/util/stop.(*Stopper).RunTaskWithErr at line 302
pkg/server/node.go in pkg/server.(*Node).batchInternal at line 913
pkg/server/node.go in pkg/server.(*Node).Batch at line 951
pkg/rpc/context.go in pkg/rpc.internalClientAdapter.Batch at line 537
pkg/kv/kvclient/kvcoord/transport.go in pkg/kv/kvclient/kvcoord.(*grpcTransport).sendBatch at line 199
pkg/kv/kvclient/kvcoord/transport.go in pkg/kv/kvclient/kvcoord.(*grpcTransport).SendNext at line 168
pkg/kv/kvclient/kvcoord/dist_sender.go in pkg/kv/kvclient/kvcoord.(*DistSender).sendToReplicas at line 1630
pkg/kv/kvclient/kvcoord/dist_sender.go in pkg/kv/kvclient/kvcoord.(*DistSender).sendRPC at line 460
pkg/kv/kvclient/kvcoord/dist_sender.go in pkg/kv/kvclient/kvcoord.(*DistSender).sendSingleRange at line 542
pkg/kv/kvclient/kvcoord/dist_sender.go in pkg/kv/kvclient/kvcoord.(*DistSender).sendPartialBatch at line 1401
pkg/kv/kvclient/kvcoord/dist_sender.go in pkg/kv/kvclient/kvcoord.(*DistSender).divideAndSendBatchToRanges at line 1086
pkg/kv/kvclient/kvcoord/dist_sender.go in pkg/kv/kvclient/kvcoord.(*DistSender).Send at line 737
pkg/kv/kvclient/kvcoord/txn_lock_gatekeeper.go in pkg/kv/kvclient/kvcoord.(*txnLockGatekeeper).SendLocked at line 86
pkg/kv/kvclient/kvcoord/txn_interceptor_metric_recorder.go in pkg/kv/kvclient/kvcoord.(*txnMetricRecorder).SendLocked at line 46
pkg/kv/kvclient/kvcoord/txn_interceptor_committer.go in pkg/kv/kvclient/kvcoord.(*txnCommitter).SendLocked at line 126
pkg/kv/kvclient/kvcoord/txn_interceptor_span_refresher.go in pkg/kv/kvclient/kvcoord.(*txnSpanRefresher).sendLockedWithRefreshAttempts at line 225
pkg/kv/kvclient/kvcoord/txn_interceptor_span_refresher.go in pkg/kv/kvclient/kvcoord.(*txnSpanRefresher).SendLocked at line 183
pkg/kv/kvclient/kvcoord/txn_interceptor_pipeliner.go in pkg/kv/kvclient/kvcoord.(*txnPipeliner).SendLocked at line 224
pkg/kv/kvclient/kvcoord/txn_interceptor_seq_num_allocator.go in pkg/kv/kvclient/kvcoord.(*txnSeqNumAllocator).SendLocked at line 105
pkg/kv/kvclient/kvcoord/txn_interceptor_heartbeater.go in pkg/kv/kvclient/kvcoord.(*txnHeartbeater).SendLocked at line 172
pkg/kv/kvclient/kvcoord/txn_coord_sender.go in pkg/kv/kvclient/kvcoord.(*TxnCoordSender).Send at line 504
pkg/kv/db.go in pkg/kv.(*DB).sendUsingSender at line 739
pkg/kv/txn.go in pkg/kv.(*Txn).Send at line 894
pkg/sql/row/kv_batch_fetcher.go in pkg/sql/row.makeKVBatchFetcher.func1 at line 184
pkg/sql/row/kv_batch_fetcher.go in pkg/sql/row.(*txnKVFetcher).fetch at line 315
pkg/sql/row/kv_batch_fetcher.go in pkg/sql/row.(*txnKVFetcher).nextBatch at line 399
pkg/sql/row/kv_fetcher.go in pkg/sql/row.(*KVFetcher).NextKV at line 87
pkg/sql/row/fetcher.go in pkg/sql/row.(*Fetcher).NextKey at line 598
pkg/sql/row/fetcher.go in pkg/sql/row.(*Fetcher).StartScanFrom at line 588
pkg/sql/row/fetcher.go in pkg/sql/row.(*Fetcher).StartScan at line 484
pkg/sql/rowexec/tablereader.go in pkg/sql/rowexec.(*tableReader).Start at line 163
pkg/sql/execinfra/processorsbase.go in pkg/sql/execinfra.(*ProcessorBase).Run at line 748
pkg/sql/flowinfra/flow.go in pkg/sql/flowinfra.(*FlowBase).Run at line 370
pkg/sql/distsql_running.go in pkg/sql.(*DistSQLPlanner).Run at line 378
pkg/sql/distsql_running.go in pkg/sql.(*DistSQLPlanner).PlanAndRun at line 988
pkg/sql/conn_executor_exec.go in pkg/sql.(*connExecutor).execWithDistSQLEngine at line 868
pkg/sql/conn_executor_exec.go in pkg/sql.(*connExecutor).dispatchToExecutionEngine at line 766
pkg/sql/conn_executor_exec.go in pkg/sql.(*connExecutor).execStmtInOpenState at line 471
pkg/sql/conn_executor_exec.go in pkg/sql.(*connExecutor).execStmt at line 94
pkg/sql/conn_executor.go in pkg/sql.(*connExecutor).execCmd at line 1394
pkg/sql/conn_executor.go in pkg/sql.(*connExecutor).run at line 1323
pkg/sql/conn_executor.go in pkg/sql.(*Server).ServeConn at line 478
pkg/sql/pgwire/conn.go in pkg/sql/pgwire.(*conn).processCommandsAsync.func1 at line 594
Cockroach Release: v20.1.0-beta.3
Cockroach SHA: fcd74cd
Platform: linux amd64
Distribution: CCL
Environment: v20.1.0-beta.3
Command: server
Go Version: go1.13.5
# of CPUs: 8
# of Goroutines: 219
@cockroach-teamcity added the C-bug and O-sentry labels Mar 26, 2020
@yuzefovich changed the title from "sentry: replica_read.go:95: received %d results, limit was %d | int64; int64" to "kvserver: v20.1.0-beta.3: received ... results, limit was ..." Mar 27, 2020
@yuzefovich (Member)

cc @nvanbenschoten @andreimatei not sure how concerning this is

@andreimatei (Contributor)

Most concerning, I'd say, since the sentry report already has two events from distinct clusters. Both events show exactly the same query fingerprint; I don't know what to make of that.
@tbg has touched code around here in #45008, which is in beta3 but not beta2. Mind combing the sucker?

I'll add it to the release blockers.

@tbg (Member) commented Mar 30, 2020

Is there any way to see whether pebble was perhaps used here? That the fingerprint looks the same suggests to me that this is the same user (perhaps having recreated the cluster). I took a look at the batch evaluation code but nothing jumped out at me, so I started looking into whether there's maybe a problem with the pebble MVCC scanner. I didn't find anything there either, but it occurred to me that we ought to be able to tell from a sentry report whether pebble was active. @petermattis that seems to be something storage should look to get into the release. I believe that's achieved by massaging extraDetails here:

msg, details, extraDetails := errors.BuildSentryReport(err)
log.SendReport(ctx, msg, log.ReportTypeError, extraDetails, details...)
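
For illustration, a minimal self-contained sketch of the idea; the engineInUse and sendReport helpers below are hypothetical stand-ins, not the actual CockroachDB reporting API:

package main

import "fmt"

// engineInUse is a hypothetical helper standing in for however the server
// would look up its configured storage engine ("rocksdb" or "pebble").
func engineInUse() string { return "pebble" }

// sendReport is a hypothetical stand-in for the real Sentry reporting call;
// it only shows where an extra "engine" detail could be attached.
func sendReport(msg string, tags map[string]string) {
	fmt.Printf("report: %q tags=%v\n", msg, tags)
}

func main() {
	tags := map[string]string{"engine": engineInUse()}
	sendReport("received %d results, limit was %d", tags)
}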

@tbg (Member) commented Mar 30, 2020

The other far-fetched possibility I considered is a data race, where DistSender changes MaxSpanRequestKeys when resending a request (note how the stack is purely local, in both events). But this can't be it: we pass the field by value through several layers of DistSender (and besides, we don't parallelize limited requests).
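
A quick self-contained illustration of that reasoning (the header struct below is a toy, not roachpb.Header): a field passed by value is copied into the callee, so a later mutation by the caller cannot be observed mid-request.

package main

import "fmt"

// header is a toy stand-in for a batch header carrying a key limit.
type header struct {
	MaxSpanRequestKeys int64
}

// send receives the header by value, so it operates on its own copy.
func send(h header) int64 {
	return h.MaxSpanRequestKeys
}

func main() {
	h := header{MaxSpanRequestKeys: 3}
	got := send(h)
	h.MaxSpanRequestKeys = 7               // caller mutates its own copy afterwards
	fmt.Println(got, h.MaxSpanRequestKeys) // prints "3 7": the callee's view was unaffected
}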

tbg added a commit to tbg/cockroach that referenced this issue Mar 30, 2020
Downgrade the assertion to returning an error to the user, asking them nicely to provide their repro on the issue. We keep reporting to sentry despite not terminating the process.

Touches cockroachdb#46652.

Release justification: low-risk reporting improvement
Release note: None
@tbg (Member) commented Mar 30, 2020

I looked some more and didn't find anything. It's likely that I did break something (or expose something that was previously not exercised). PR #46720 downgrades the assertion (but keeps reporting) to, hopefully, resolve this soon with external input.

@petermattis (Collaborator)

Is there any way to see whether pebble was perhaps used here?

We do have telemetry about Pebble usage. I like the idea of including it in the sentry report too.

tbg added a commit to tbg/cockroach that referenced this issue Apr 1, 2020
Downgrade the assertion to returning an error to the user, asking them nicely to provide their repro on the issue. We keep reporting to sentry despite not terminating the process.

Touches cockroachdb#46652.

Release justification: low-risk reporting improvement
Release note: None
craig bot pushed a commit that referenced this issue Apr 1, 2020
46720: kvserver: improve reporting for an assertion r=andreimatei a=tbg

Downgrade the assertion to returning an error to the user, asking them nicely to provide their repro on the issue. We keep reporting to sentry despite not terminating the process.

Touches #46652.

Release justification: low-risk reporting improvement
Release note: None

Co-authored-by: Tobias Schottdorf <[email protected]>
tbg added a commit to tbg/cockroach that referenced this issue Apr 3, 2020
Downgrade the assertion to returning an error to the user, asking them nicely to provide their repro on the issue. We keep reporting to sentry despite not terminating the process.

Touches cockroachdb#46652.

Release justification: low-risk reporting improvement
Release note: None
@nvanbenschoten (Member)

Now that #46976 has landed, do we feel comfortable removing it from the release blocker list? Or is there more to do here before the release?

@tbg (Member) commented Apr 6, 2020

Judging by the report, it seems likely that this is a bug that users will hit "quickly". If we had randomized testing for read-only batches with limits and it failed to reproduce, I would be comfortable releasing. As is, I worry that the moment 20.1 gets out there someone will hit it and we'll have to rush in.

Going to look at randomized testing tomorrow. It shouldn't be too hard to write something quick and dirty that can tell us more. Also will inspect the code paths again.

Do you think kvnemesis can be used for this easily?

@nvanbenschoten (Member)

Do you think kvnemesis can be used for this easily?

If I recall correctly, kvnemesis doesn't currently support ranged operations, so there would be some work to do. Adding such support shouldn't be too difficult though, and it does seem like the right tool for the job.

@tbg (Member) commented Apr 7, 2020

I hacked something up (see #47120) and it's definitely sending lots of random scans, but I haven't gotten a repro, at least not immediately over hundreds of runs.
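
Roughly the shape of the traffic that generates, as a standalone sketch; runScan is a hypothetical stand-in for issuing a limited Scan or ReverseScan against a test cluster (kvnemesis does this through its applier):

package main

import (
	"fmt"
	"math/rand"
)

// runScan is a hypothetical stand-in for sending a (Reverse)Scan with a key
// limit and returning how many rows came back; here it just fakes a result.
func runScan(start, end byte, limit int64, reverse bool) int64 {
	n := int64(end - start)
	if n > limit {
		n = limit
	}
	return n
}

func main() {
	rng := rand.New(rand.NewSource(1))
	for i := 0; i < 1000; i++ {
		start := byte('a' + rng.Intn(20))
		end := start + byte(1+rng.Intn(5))
		limit := int64(1 + rng.Intn(10))
		if got := runScan(start, end, limit, rng.Intn(2) == 0); got > limit {
			panic(fmt.Sprintf("received %d results, limit was %d", got, limit))
		}
	}
	fmt.Println("no limit violations over 1000 random scans")
}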

tbg added a commit to tbg/cockroach that referenced this issue Apr 7, 2020
Hack up just enough support for scans to see if we can perhaps tickle
the release blocker here:

cockroachdb#46652

No validation was added. We're just using kvnemesis to send random
scans for us.

Release note: None
@tbg (Member) commented Apr 7, 2020

Status update: I asked @nvanbenschoten to do a code audit pass to see if he can spot a bug in my refactors. If that fails to turn up anything, I suggest we release.
NB: there haven't been new occurrences on Sentry other than those two (which were likely generated by the same user), though rc3 adoption is likely low.

@nvanbenschoten (Member)

I don't have anything new to report here, but I did just find #40958, which may be relevant.

@nvanbenschoten (Member)

Nothing is jumping out at me here. RocksDB's and Pebble's implementations of MVCCScan both look good. So does evaluateBatch.

My only suggestions are that you:

  1. adapt #47120 ([dnm] use scans in kvnemesis) to ReverseScan
  2. hack around https://github.com/cockroachdb/cockroach/pull/47120/files#diff-e04ef583219215b6b7c19c4312e683dcR171 to assign a limit to batches

We know from the stack trace that the request was a BatchRequest with MaxSpanRequestKeys (and TargetBytes) set, consisting of either all ScanRequests or all ReverseScanRequests, so we should test all of those cases.
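
A sketch of the case matrix that observation implies (values made up), enumerating forward/reverse scans crossed with key and byte limits:

package main

import "fmt"

func main() {
	// The stack trace implies batches of all Scans or all ReverseScans with
	// MaxSpanRequestKeys (and possibly TargetBytes) set; each combination is
	// a case worth exercising in the randomized test.
	for _, reverse := range []bool{false, true} {
		for _, maxKeys := range []int64{1, 3, 10} {
			for _, targetBytes := range []int64{0, 1 << 10} {
				fmt.Printf("case: reverse=%t maxKeys=%d targetBytes=%d\n",
					reverse, maxKeys, targetBytes)
			}
		}
	}
}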

@tbg (Member) commented Apr 7, 2020 via email

@tbg (Member) commented Apr 8, 2020

Hmm, in #40958 the user sort of claims that they were running with faulty memory? I don't trust that very much, but it's good to see that we saw this issue well before I made changes in this area. This is further evidence that it should not block the release.

Will look into your suggestions now.

@tbg (Member) commented Apr 8, 2020

I found more real bugs after putting in your suggestions (see #47120), though no repro for this one.

@tbg (Member) commented Apr 15, 2020

Not really for this issue (here we're looking at a read-only batch), but what RevertRange does is super suspicious because that command isn't even MVCC-aware and doesn't want the usual semantics. It will happily go through the whole keyspace range by range until it hits its limit on one of the ranges. Then, once it has hit the limit, subsequent requests in the same batch still clear the first key (above this code).

// If, and only if, we're returning a resume span do we want to return >0
// NumKeys. Distsender will reduce the limit for subsequent requests by the
// amount returned, but that doesn't really make sense for RevertRange:
// there isn't some combined result set size we're trying to hit across many
// requests; just because some earlier range ran X Clears that does not mean
// we want the next range to run fewer than the limit chosen for the batch
// size reasons. On the other hand, we have to pass MaxKeys though if we
// return a resume span to cause distsender to stop after this request, as
// currently response combining's handling of resume spans prefers that
// there only be one. Thus we just set it to MaxKeys when, and only when,
// we're returning a ResumeSpan.
reply.NumKeys = cArgs.Header.MaxSpanRequestKeys
reply.ResumeReason = roachpb.RESUME_KEY_LIMIT

for _, span := range spans {
	b.AddRawRequest(&roachpb.RevertRangeRequest{
		RequestHeader: roachpb.RequestHeader{
			Key:    span.Key,
			EndKey: span.EndKey,
		},
		TargetTime: targetTime,
	})
}
b.Header.MaxSpanRequestKeys = batchSize
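
Schematically, the limit accounting described above looks like this (a toy sketch, not the actual DistSender code): the key limit for each subsequent range is reduced by the NumKeys the previous range reported, and a resume span stops the batch.

package main

import "fmt"

// rangeResult models what one per-range response contributes.
type rangeResult struct {
	numKeys   int64
	hasResume bool
}

func main() {
	const maxSpanRequestKeys int64 = 10
	responses := []rangeResult{{numKeys: 4}, {numKeys: 3}, {numKeys: 3, hasResume: true}}
	remaining := maxSpanRequestKeys
	for i, r := range responses {
		fmt.Printf("range %d: limit=%d, got %d keys\n", i, remaining, r.numKeys)
		// Reduce the limit for the next range by what this range consumed;
		// stop once a resume span comes back or the limit is exhausted.
		remaining -= r.numKeys
		if r.hasResume || remaining <= 0 {
			break
		}
	}
}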

@knz (Contributor) commented Apr 29, 2020

Looks like RC2 is still affected: #48121

@tbg (Member) commented Apr 29, 2020

Oh nice. OK, here's the rub: using rocksdb and start-single-node,

received %d results, limit was %d (original limit: %d, batch=%s idx=%d)
-- arg 1: 7
-- arg 2: 3
-- arg 3: 3
-- arg 4: 1 Scan
-- arg 5: 0

We received 7 results but wanted 3; the original limit was also 3 (duh), and the batch was just a single Scan.

Very surprising. The report was triggered by a long SELECT statement.
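
Schematically, this is the invariant that fired, mirroring the assertion message (a toy sketch, not the actual replica_read.go code):

package main

import "fmt"

// checkResultCount mirrors, schematically, the assertion that fired: a
// read-only batch must never return more rows than the key limit it carried.
func checkResultCount(numResults, limit int64) error {
	if limit > 0 && numResults > limit {
		return fmt.Errorf("received %d results, limit was %d", numResults, limit)
	}
	return nil
}

func main() {
	// The values from the report: 7 rows for a single Scan with a limit of 3.
	if err := checkResultCount(7, 3); err != nil {
		fmt.Println(err)
	}
}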

@tbg (Member) commented Apr 29, 2020

There are a dozen or so repros from this, all on young clusters (start-single-node), so it's likely that one person/entity is able to hit this bug and is doing it over and over. The details are always either the above one or this (which is the same, just with different numbers):

received %d results, limit was %d (original limit: %d, batch=%s idx=%d)
-- arg 1: 4
-- arg 2: 3
-- arg 3: 3
-- arg 4: 1 Scan
-- arg 5: 0

The statements triggering this are:

  1. SELECT _._, ............, _._ FROM _._ LIMIT _
  2. SELECT (_) AS _ FROM _ WHERE ((_._ >= _) AND (_._ <= _)) LIMIT _

The second one is also what triggered it in the first report in this thread.

@tbg (Member) commented Apr 29, 2020

I must say this is weird. Single forward scans are the least likely to hit any kind of bug here. The three clusters that generated all the reports have had no involvement in any other sentry reports (i.e. they don't have busted disks or something like that). I hope the error, which links to this issue, motivates the person running those queries to come forward and file a report here.

@tbg (Member) commented Apr 29, 2020

I looked at the code again and have no idea how this could happen, except for a bug at the MVCC layer or below. I was hoping that, somehow, magically, this could be caused by the internal read retry here

https://github.com/cockroachdb/cockroach/blob/a72b871dfb16322eb54497ab99c03107e11dc36a/pkg/kv/kvserver/replica_read.go#L134-L133

but after injecting an artificial retry there, it didn't pan out (just as reading the code suggested it would not).

@tbg removed their assignment Apr 29, 2020
@tbg (Member) commented Apr 29, 2020

@itsbilal seems to have figured it out (or at least found a plausible bug). 🙇‍♂️ and handing over.

itsbilal added a commit to itsbilal/cockroach that referenced this issue Apr 29, 2020
We weren't checking for MaxKeys (or TargetBytes) being reached
in the case where we read from intent history in the MVCC scanner.
All other cases go through addAndAdvance(), which had these checks.

Almost certainly fixes cockroachdb#46652. Would be very surprised if it was
something else.

Release note (bug fix): Fixes a bug where a read operation in a transaction
would error out for exceeding the maximum count of results returned.
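
Schematically, the shape of the fix (a toy sketch, not the actual pebbleMVCCScanner or libroach code): the intent-history read path must go through the same MaxKeys/TargetBytes accounting that addAndAdvance already performs.

package main

import "fmt"

// scanner is a toy stand-in for the MVCC scanner state relevant here.
type scanner struct {
	maxKeys int64
	results int64
	resume  bool
}

// addAndAdvance holds the limit check: once maxKeys results have been
// gathered, record a resume point and stop instead of adding more.
func (s *scanner) addAndAdvance() bool {
	if s.maxKeys > 0 && s.results >= s.maxKeys {
		s.resume = true
		return false
	}
	s.results++
	return true
}

// getFromIntentHistory models the buggy path: before the fix it added the
// value unconditionally; after the fix it respects the same limit check.
func (s *scanner) getFromIntentHistory() bool {
	return s.addAndAdvance()
}

func main() {
	s := &scanner{maxKeys: 3}
	for i := 0; i < 7; i++ {
		if !s.getFromIntentHistory() {
			break
		}
	}
	fmt.Printf("results=%d resume=%t\n", s.results, s.resume) // results=3 resume=true
}
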
craig bot pushed a commit that referenced this issue Apr 29, 2020
46992: sql: Add Logical Column ID field to ColumnDescriptor r=rohany a=RichardJCai

The LogicalColumnID field mimics the ColumnID field; however, LogicalColumnID may be swapped between two columns, whereas ColumnID cannot. LogicalColumnID is referenced for virtual tables (pg_catalog, information_schema) and most notably affects column ordering for SHOW COLUMNS.

This LogicalColumnID field supports swapping the order of two columns; currently it is only used for ALTER COLUMN TYPE, when a shadow column is created and swapped with its original column.

Does not affect existing behaviour.

Release note: None

47449: cli: add --cert-principal-map to client commands r=petermattis a=petermattis

Add support for the `--cert-principal-map` flag to the certs and client
commands. Anywhere we were accepting the `--certs-dir` flag, we now also
accept the `--cert-principal-map` flag.

Fixes #47300

Release note (cli change): Support the `--cert-principal-map` flag in
the `cert *` and "client" commands such as `sql`.

48138: keys: support splitting Ranges on tenant-id prefixed keys r=nvanbenschoten a=nvanbenschoten

Fixes #48122.
Relates to #47903.
Relates to #48123.

This PR contains a series of small commits that work towards the introduction of tenant-id prefixed keyspaces and begin the removal of some `keys.TODOSQLCodec` instances. This should be the only time we need to touch C++ throughout this work.

48160: storage,libroach: Check for MaxKeys when reading from intent history r=itsbilal a=itsbilal

We weren't checking for MaxKeys (or TargetBytes) being reached
in the case where we read from intent history in the MVCC scanner.
All other cases go through addAndAdvance(), which had these checks.

Almost certainly fixes #46652. Would be very surprised if it was
something else.

Release note (bug fix): Fixes a bug where a read operation in a transaction
would error out for exceeding the maximum count of results returned.

48162: opt: add rule to eliminate Exists when input has zero rows r=rytaft a=rytaft

This commit adds a new rule, `EliminateExistsZeroRows`, which
converts an `Exists` subquery to False when it's known
that the input produces zero rows.

Informs #47058

Release note (performance improvement): The optimizer can now
detect when an Exists subquery can be eliminated because the input
has zero rows. This leads to better plans in some cases.

Co-authored-by: richardjcai <[email protected]>
Co-authored-by: Peter Mattis <[email protected]>
Co-authored-by: Nathan VanBenschoten <[email protected]>
Co-authored-by: Bilal Akhtar <[email protected]>
Co-authored-by: Rebecca Taft <[email protected]>
@craig (craig bot) closed this as completed in 3628ed9 Apr 29, 2020
itsbilal added a commit to itsbilal/cockroach that referenced this issue May 1, 2020
Backport of cockroachdb#48160.

We weren't checking for MaxKeys (or TargetBytes) being reached
in the case where we read from intent history in the MVCC scanner.
All other cases go through addAndAdvance(), which had these checks.

Almost certainly fixes cockroachdb#46652. Would be very surprised if it was
something else.

Release note (bug fix): Fixes a bug where a read operation in a transaction
would error out for exceeding the maximum count of results returned.
timgraham added a commit to timgraham/django-cockroachdb that referenced this issue May 7, 2020
timgraham added a commit to timgraham/django-cockroachdb that referenced this issue May 10, 2020
timgraham added a commit to timgraham/django-cockroachdb that referenced this issue May 11, 2020