kvserver: v20.1.0-beta.3: received ... results, limit was ... #46652

Closed
cockroach-teamcity opened this issue Mar 26, 2020 · 23 comments · Fixed by #48160
Labels
C-bug: Code not up to spec/doc, specs & docs deemed correct. Solution expected to change code/behavior.
O-sentry: Originated from an in-the-wild panic report.

Comments

@cockroach-teamcity (Member)

This issue was autofiled by Sentry. It represents a crash or reported error on a live cluster with telemetry enabled.

Sentry link: https://sentry.io/organizations/cockroach-labs/issues/1583480205/?referrer=webhooks_plugin

Panic message:

replica_read.go:95: received %d results, limit was %d | int64; int64

Stacktrace (with inline code snippets):

}
br, res, pErr = evaluateBatch(ctx, storagebase.CmdIDKey(""), rw, rec, nil, ba, true /* readOnly */)
in pkg/kv/kvserver.(*Replica).executeReadOnlyBatchWithServersideRefreshes
var result result.Result
br, result, pErr = r.executeReadOnlyBatchWithServersideRefreshes(ctx, rw, rec, ba, spans)
if err := r.handleReadOnlyLocalEvalResult(ctx, ba, result.Local); err != nil {
in pkg/kv/kvserver.(*Replica).executeReadOnlyBatch
br, g, pErr = fn(r, ctx, ba, status, g)
switch t := pErr.GetDetail().(type) {
in pkg/kv/kvserver.(*Replica).executeBatchWithConcurrencyRetries
fn := (*Replica).executeReadOnlyBatch
br, pErr = r.executeBatchWithConcurrencyRetries(ctx, ba, fn)
} else if ba.IsAdmin() {
in pkg/kv/kvserver.(*Replica).sendWithRangeID
) (*roachpb.BatchResponse, *roachpb.Error) {
return r.sendWithRangeID(ctx, r.RangeID, &ba)
}
in pkg/kv/kvserver.(*Replica).Send
br, pErr = repl.Send(ctx, ba)
if pErr == nil {
in pkg/kv/kvserver.(*Store).Send
br, pErr := store.Send(ctx, ba)
if br != nil && br.Error != nil {
in pkg/kv/kvserver.(*Stores).Send
var pErr *roachpb.Error
br, pErr = n.stores.Send(ctx, *args)
if pErr != nil {
in pkg/server.(*Node).batchInternal.func1
return f(ctx)
}
in pkg/util/stop.(*Stopper).RunTaskWithErr
var br *roachpb.BatchResponse
if err := n.stopper.RunTaskWithErr(ctx, "node.Node: batch", func(ctx context.Context) error {
var finishSpan func(*roachpb.BatchResponse)
in pkg/server.(*Node).batchInternal
br, err := n.batchInternal(ctx, args)
in pkg/server.(*Node).Batch
) (*roachpb.BatchResponse, error) {
return a.InternalServer.Batch(ctx, ba)
}
in pkg/rpc.internalClientAdapter.Batch
}
reply, err := iface.Batch(ctx, &ba)
// If we queried a remote node, perform extra validation and
in pkg/kv/kvclient/kvcoord.(*grpcTransport).sendBatch
ba.Replica = client.replica
reply, err := gt.sendBatch(ctx, client.replica.NodeID, iface, ba)
in pkg/kv/kvclient/kvcoord.(*grpcTransport).SendNext
}
br, err := transport.SendNext(ctx, ba)
// maxSeenLeaseSequence tracks the maximum LeaseSequence seen in a
in pkg/kv/kvclient/kvcoord.(*DistSender).sendToReplicas
return ds.sendToReplicas(
ctx,
in pkg/kv/kvclient/kvcoord.(*DistSender).sendRPC
class := rpc.ConnectionClassForKey(desc.RSpan().Key)
br, err := ds.sendRPC(ctx, ba, class, desc.RangeID, replicas, cachedLeaseHolder, withCommit)
if err != nil {
in pkg/kv/kvclient/kvcoord.(*DistSender).sendSingleRange
reply, pErr = ds.sendSingleRange(ctx, ba, desc, withCommit)
in pkg/kv/kvclient/kvcoord.(*DistSender).sendPartialBatch
if !ri.NeedAnother(rs) {
resp := ds.sendPartialBatch(
ctx, ba, rs, ri.Desc(), ri.Token(), withCommit, batchIdx, false, /* needsTruncate */
in pkg/kv/kvclient/kvcoord.(*DistSender).divideAndSendBatchToRanges
} else {
rpl, pErr = ds.divideAndSendBatchToRanges(ctx, ba, rs, withCommit, 0 /* batchIdx */)
}
in pkg/kv/kvclient/kvcoord.(*DistSender).Send
defer gs.mu.Lock()
return gs.wrapped.Send(ctx, ba)
}
in pkg/kv/kvclient/kvcoord.(*txnLockGatekeeper).SendLocked
br, pErr := m.wrapped.SendLocked(ctx, ba)
if pErr != nil {
in pkg/kv/kvclient/kvcoord.(*txnMetricRecorder).SendLocked
if !hasET {
return tc.wrapped.SendLocked(ctx, ba)
}
in pkg/kv/kvclient/kvcoord.(*txnCommitter).SendLocked
}
br, pErr := sr.wrapped.SendLocked(ctx, ba)
in pkg/kv/kvclient/kvcoord.(*txnSpanRefresher).sendLockedWithRefreshAttempts
// Send through wrapped lockedSender. Unlocks while sending then re-locks.
br, pErr := sr.sendLockedWithRefreshAttempts(ctx, ba, maxAttempts)
if pErr != nil {
in pkg/kv/kvclient/kvcoord.(*txnSpanRefresher).SendLocked
// Send through wrapped lockedSender. Unlocks while sending then re-locks.
br, pErr := tp.wrapped.SendLocked(ctx, ba)
in pkg/kv/kvclient/kvcoord.(*txnPipeliner).SendLocked
return s.wrapped.SendLocked(ctx, ba)
}
in pkg/kv/kvclient/kvcoord.(*txnSeqNumAllocator).SendLocked
// Forward the batch through the wrapped lockedSender.
return h.wrapped.SendLocked(ctx, ba)
}
in pkg/kv/kvclient/kvcoord.(*txnHeartbeater).SendLocked
// Send the command through the txnInterceptor stack.
br, pErr := tc.interceptorStack[0].SendLocked(ctx, ba)
in pkg/kv/kvclient/kvcoord.(*TxnCoordSender).Send

tracing.AnnotateTrace()
br, pErr := sender.Send(ctx, ba)
if pErr != nil {
in pkg/kv.(*DB).sendUsingSender

txn.mu.Unlock()
br, pErr := txn.db.sendUsingSender(ctx, ba, sender)
if pErr == nil {
in pkg/kv.(*Txn).Send
sendFn := func(ctx context.Context, ba roachpb.BatchRequest) (*roachpb.BatchResponse, error) {
res, err := txn.Send(ctx, ba)
if err != nil {
in pkg/sql/row.makeKVBatchFetcher.func1
br, err := f.sendFn(ctx, ba)
if err != nil {
in pkg/sql/row.(*txnKVFetcher).fetch
}
if err := f.fetch(ctx); err != nil {
return false, nil, nil, roachpb.Span{}, err
in pkg/sql/row.(*txnKVFetcher).nextBatch
ok, f.kvs, f.batchResponse, f.Span, err = f.nextBatch(ctx)
if err != nil {
in pkg/sql/row.(*KVFetcher).NextKV
for {
ok, rf.kv, _, err = rf.kvFetcher.NextKV(ctx)
if err != nil {
in pkg/sql/row.(*Fetcher).NextKey
// Retrieve the first key.
_, err := rf.NextKey(ctx)
return err
in pkg/sql/row.(*Fetcher).StartScanFrom
}
return rf.StartScanFrom(ctx, &f)
}
in pkg/sql/row.(*Fetcher).StartScan
if tr.maxTimestampAge == 0 {
err = tr.fetcher.StartScan(
ctx, tr.FlowCtx.Txn, tr.spans,
in pkg/sql/rowexec.(*tableReader).Start
}
ctx = pb.self.Start(ctx)
Run(ctx, pb.self, pb.Out.output)
in pkg/sql/execinfra.(*ProcessorBase).Run
}
headProc.Run(ctx)
return nil
in pkg/sql/flowinfra.(*FlowBase).Run
// TODO(radu): this should go through the flow scheduler.
if err := flow.Run(ctx, func() {}); err != nil {
log.Fatalf(ctx, "unexpected error from syncFlow.Start(): %s "+
in pkg/sql.(*DistSQLPlanner).Run
recv.expectedRowsRead = int64(physPlan.TotalEstimatedScannedRows)
return dsp.Run(planCtx, txn, &physPlan, recv, evalCtx, nil /* finishedSetupFn */)
}
in pkg/sql.(*DistSQLPlanner).PlanAndRun
// the planner whether or not to plan remote table readers.
cleanup := ex.server.cfg.DistSQLPlanner.PlanAndRun(
ctx, evalCtx, planCtx, planner.txn, planner.curPlan.plan, recv,
in pkg/sql.(*connExecutor).execWithDistSQLEngine
ex.sessionTracing.TraceExecStart(ctx, "distributed")
bytesRead, rowsRead, err := ex.execWithDistSQLEngine(ctx, planner, stmt.AST.StatementType(), res, distributePlan, progAtomic)
ex.sessionTracing.TraceExecEnd(ctx, res.Err(), res.RowsAffected())
in pkg/sql.(*connExecutor).dispatchToExecutionEngine
p.autoCommit = os.ImplicitTxn.Get() && !ex.server.cfg.TestingKnobs.DisableAutoCommit
if err := ex.dispatchToExecutionEngine(ctx, p, res); err != nil {
return nil, nil, err
in pkg/sql.(*connExecutor).execStmtInOpenState
} else {
ev, payload, err = ex.execStmtInOpenState(ctx, stmt, res, pinfo)
}
in pkg/sql.(*connExecutor).execStmt
stmtCtx := withStatement(ctx, ex.curStmt)
ev, payload, err = ex.execStmt(stmtCtx, curStmt, stmtRes, nil /* pinfo */)
if err != nil {
in pkg/sql.(*connExecutor).execCmd
var err error
if err = ex.execCmd(ex.Ctx()); err != nil {
if err == io.EOF || err == errDrainingComplete {
in pkg/sql.(*connExecutor).run
}()
return h.ex.run(ctx, s.pool, reserved, cancel)
}
in pkg/sql.(*Server).ServeConn
reservedOwned = false // We're about to pass ownership away.
retErr = sqlServer.ServeConn(ctx, connHandler, reserved, cancelConn)
}()
in pkg/sql/pgwire.(*conn).processCommandsAsync.func1

pkg/kv/kvserver/replica_read.go in pkg/kv/kvserver.(*Replica).executeReadOnlyBatchWithServersideRefreshes at line 95
pkg/kv/kvserver/replica_read.go in pkg/kv/kvserver.(*Replica).executeReadOnlyBatch at line 67
pkg/kv/kvserver/replica_send.go in pkg/kv/kvserver.(*Replica).executeBatchWithConcurrencyRetries at line 236
pkg/kv/kvserver/replica_send.go in pkg/kv/kvserver.(*Replica).sendWithRangeID at line 98
pkg/kv/kvserver/replica_send.go in pkg/kv/kvserver.(*Replica).Send at line 36
pkg/kv/kvserver/store_send.go in pkg/kv/kvserver.(*Store).Send at line 204
pkg/kv/kvserver/stores.go in pkg/kv/kvserver.(*Stores).Send at line 188
pkg/server/node.go in pkg/server.(*Node).batchInternal.func1 at line 925
pkg/util/stop/stopper.go in pkg/util/stop.(*Stopper).RunTaskWithErr at line 302
pkg/server/node.go in pkg/server.(*Node).batchInternal at line 913
pkg/server/node.go in pkg/server.(*Node).Batch at line 951
pkg/rpc/context.go in pkg/rpc.internalClientAdapter.Batch at line 537
pkg/kv/kvclient/kvcoord/transport.go in pkg/kv/kvclient/kvcoord.(*grpcTransport).sendBatch at line 199
pkg/kv/kvclient/kvcoord/transport.go in pkg/kv/kvclient/kvcoord.(*grpcTransport).SendNext at line 168
pkg/kv/kvclient/kvcoord/dist_sender.go in pkg/kv/kvclient/kvcoord.(*DistSender).sendToReplicas at line 1630
pkg/kv/kvclient/kvcoord/dist_sender.go in pkg/kv/kvclient/kvcoord.(*DistSender).sendRPC at line 460
pkg/kv/kvclient/kvcoord/dist_sender.go in pkg/kv/kvclient/kvcoord.(*DistSender).sendSingleRange at line 542
pkg/kv/kvclient/kvcoord/dist_sender.go in pkg/kv/kvclient/kvcoord.(*DistSender).sendPartialBatch at line 1401
pkg/kv/kvclient/kvcoord/dist_sender.go in pkg/kv/kvclient/kvcoord.(*DistSender).divideAndSendBatchToRanges at line 1086
pkg/kv/kvclient/kvcoord/dist_sender.go in pkg/kv/kvclient/kvcoord.(*DistSender).Send at line 737
pkg/kv/kvclient/kvcoord/txn_lock_gatekeeper.go in pkg/kv/kvclient/kvcoord.(*txnLockGatekeeper).SendLocked at line 86
pkg/kv/kvclient/kvcoord/txn_interceptor_metric_recorder.go in pkg/kv/kvclient/kvcoord.(*txnMetricRecorder).SendLocked at line 46
pkg/kv/kvclient/kvcoord/txn_interceptor_committer.go in pkg/kv/kvclient/kvcoord.(*txnCommitter).SendLocked at line 126
pkg/kv/kvclient/kvcoord/txn_interceptor_span_refresher.go in pkg/kv/kvclient/kvcoord.(*txnSpanRefresher).sendLockedWithRefreshAttempts at line 225
pkg/kv/kvclient/kvcoord/txn_interceptor_span_refresher.go in pkg/kv/kvclient/kvcoord.(*txnSpanRefresher).SendLocked at line 183
pkg/kv/kvclient/kvcoord/txn_interceptor_pipeliner.go in pkg/kv/kvclient/kvcoord.(*txnPipeliner).SendLocked at line 224
pkg/kv/kvclient/kvcoord/txn_interceptor_seq_num_allocator.go in pkg/kv/kvclient/kvcoord.(*txnSeqNumAllocator).SendLocked at line 105
pkg/kv/kvclient/kvcoord/txn_interceptor_heartbeater.go in pkg/kv/kvclient/kvcoord.(*txnHeartbeater).SendLocked at line 172
pkg/kv/kvclient/kvcoord/txn_coord_sender.go in pkg/kv/kvclient/kvcoord.(*TxnCoordSender).Send at line 504
pkg/kv/db.go in pkg/kv.(*DB).sendUsingSender at line 739
pkg/kv/txn.go in pkg/kv.(*Txn).Send at line 894
pkg/sql/row/kv_batch_fetcher.go in pkg/sql/row.makeKVBatchFetcher.func1 at line 184
pkg/sql/row/kv_batch_fetcher.go in pkg/sql/row.(*txnKVFetcher).fetch at line 315
pkg/sql/row/kv_batch_fetcher.go in pkg/sql/row.(*txnKVFetcher).nextBatch at line 399
pkg/sql/row/kv_fetcher.go in pkg/sql/row.(*KVFetcher).NextKV at line 87
pkg/sql/row/fetcher.go in pkg/sql/row.(*Fetcher).NextKey at line 598
pkg/sql/row/fetcher.go in pkg/sql/row.(*Fetcher).StartScanFrom at line 588
pkg/sql/row/fetcher.go in pkg/sql/row.(*Fetcher).StartScan at line 484
pkg/sql/rowexec/tablereader.go in pkg/sql/rowexec.(*tableReader).Start at line 163
pkg/sql/execinfra/processorsbase.go in pkg/sql/execinfra.(*ProcessorBase).Run at line 748
pkg/sql/flowinfra/flow.go in pkg/sql/flowinfra.(*FlowBase).Run at line 370
pkg/sql/distsql_running.go in pkg/sql.(*DistSQLPlanner).Run at line 378
pkg/sql/distsql_running.go in pkg/sql.(*DistSQLPlanner).PlanAndRun at line 988
pkg/sql/conn_executor_exec.go in pkg/sql.(*connExecutor).execWithDistSQLEngine at line 868
pkg/sql/conn_executor_exec.go in pkg/sql.(*connExecutor).dispatchToExecutionEngine at line 766
pkg/sql/conn_executor_exec.go in pkg/sql.(*connExecutor).execStmtInOpenState at line 471
pkg/sql/conn_executor_exec.go in pkg/sql.(*connExecutor).execStmt at line 94
pkg/sql/conn_executor.go in pkg/sql.(*connExecutor).execCmd at line 1394
pkg/sql/conn_executor.go in pkg/sql.(*connExecutor).run at line 1323
pkg/sql/conn_executor.go in pkg/sql.(*Server).ServeConn at line 478
pkg/sql/pgwire/conn.go in pkg/sql/pgwire.(*conn).processCommandsAsync.func1 at line 594
Cockroach Release: v20.1.0-beta.3
Cockroach SHA: fcd74cd
Platform: linux amd64
Distribution: CCL
Environment: v20.1.0-beta.3
Command: server
Go Version: go1.13.5
# of CPUs: 8
# of Goroutines: 219
@cockroach-teamcity added the C-bug and O-sentry labels Mar 26, 2020
@yuzefovich changed the title from "sentry: replica_read.go:95: received %d results, limit was %d | int64; int64" to "kvserver: v20.1.0-beta.3: received ... results, limit was ..." Mar 27, 2020
@yuzefovich (Member)

cc @nvanbenschoten @andreimatei not sure how concerning this is

@andreimatei (Contributor)

Most concerning, I'd say, since the sentry report already has two events from distinct clusters. Both events show exactly the same query fingerprint; I don't know what to make of that.
@tbg has touched code around here in #45008, which is in beta3 but not beta2. Mind combing the sucker?

I'll add it to the release blockers.

@tbg (Member) commented Mar 30, 2020

Is there any way to see whether pebble was perhaps used here? That the fingerprint looks the same suggests to me that this is the same user (perhaps having recreated the cluster). I took a look at the batch evaluation code but nothing jumped out at me, so I started looking into whether there's maybe a problem with the pebble MVCC scanner. I didn't find anything there either, but it occurred to me that we ought to be able to tell from a sentry report whether pebble was active. @petermattis that seems to be something storage should look to get into the release. I believe that's achieved by massaging extraDetails here:

msg, details, extraDetails := errors.BuildSentryReport(err)
log.SendReport(ctx, msg, log.ReportTypeError, extraDetails, details...)
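
For illustration, a minimal self-contained sketch of the idea; the engineInUse and sendReport helpers below are hypothetical stand-ins, not the actual CockroachDB reporting API:

package main

import "fmt"

// engineInUse is a hypothetical helper standing in for however the server
// would look up its configured storage engine ("rocksdb" or "pebble").
func engineInUse() string { return "pebble" }

// sendReport is a hypothetical stand-in for the real Sentry reporting call;
// it only shows where an extra "engine" detail could be attached.
func sendReport(msg string, tags map[string]string) {
	fmt.Printf("report: %q tags=%v\n", msg, tags)
}

func main() {
	tags := map[string]string{"engine": engineInUse()}
	sendReport("received %d results, limit was %d", tags)
}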

@tbg (Member) commented Mar 30, 2020

The other far-fetched possibility I considered is a data race, where DistSender changes MaxSpanRequestKeys when resending a request (note how the stack is purely local, in both events). But this can't be it: we pass the field by value through several layers of DistSender (and besides, we don't parallelize limited requests).
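
A quick self-contained illustration of that reasoning (the header struct below is a toy, not roachpb.Header): a field passed by value is copied into the callee, so a later mutation by the caller cannot be observed mid-request.

package main

import "fmt"

// header is a toy stand-in for a batch header carrying a key limit.
type header struct {
	MaxSpanRequestKeys int64
}

// send receives the header by value, so it operates on its own copy.
func send(h header) int64 {
	return h.MaxSpanRequestKeys
}

func main() {
	h := header{MaxSpanRequestKeys: 3}
	got := send(h)
	h.MaxSpanRequestKeys = 7               // caller mutates its own copy afterwards
	fmt.Println(got, h.MaxSpanRequestKeys) // prints "3 7": the callee's view was unaffected
}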

tbg added a commit to tbg/cockroach that referenced this issue Mar 30, 2020
Downgrade the assertion to returning an error to the user, asking them nicely to provide their repro on the issue. We keep reporting to sentry despite not terminating the process.

Touches cockroachdb#46652.

Release justification: low-risk reporting improvement
Release note: None
@tbg (Member) commented Mar 30, 2020

I looked some more and didn't find anything. It's likely that I did break something (or expose something that was previously not exercised). PR #46720 downgrades the assertion (but keeps reporting) to, hopefully, resolve this soon with external input.

@petermattis (Collaborator)

Is there any way to see whether pebble was perhaps used here?

We do have telemetry about Pebble usage. I like the idea of including it in the sentry report too.

tbg added a commit to tbg/cockroach that referenced this issue Apr 1, 2020
Downgrade the assertion to returning an error to the user, asking them nicely to provide their repro on the issue. We keep reporting to sentry despite not terminating the process.

Touches cockroachdb#46652.

Release justification: low-risk reporting improvement
Release note: None
craig bot pushed a commit that referenced this issue Apr 1, 2020
46720: kvserver: improve reporting for an assertion r=andreimatei a=tbg

Downgrade the assertion to returning an error to the user, asking them nicely to provide their repro on the issue. We keep reporting to sentry despite not terminating the process.

Touches #46652.

Release justification: low-risk reporting improvement
Release note: None

Co-authored-by: Tobias Schottdorf <[email protected]>
tbg added a commit to tbg/cockroach that referenced this issue Apr 3, 2020
Downgrade the assertion to returning an error to the user, asking them nicely to provide their repro on the issue. We keep reporting to sentry despite not terminating the process.

Touches cockroachdb#46652.

Release justification: low-risk reporting improvement
Release note: None
@nvanbenschoten (Member)

Now that #46976 has landed, do we feel comfortable removing it from the release blocker list? Or is there more to do here before the release?

@tbg (Member) commented Apr 6, 2020

Judging by the report, it seems likely that this is a bug that users will hit "quickly". If we had randomized testing for read-only batches with limits and it failed to reproduce, I would be comfortable releasing. As is, I worry that the moment 20.1 gets out there someone will hit it and we'll have to rush in.

Going to look at randomized testing tomorrow. It shouldn't be too hard to write something quick and dirty that can tell us more. Also will inspect the code paths again.

Do you think kvnemesis can be used for this easily?

@nvanbenschoten (Member)

Do you think kvnemesis can be used for this easily?

If I recall correctly, kvnemesis doesn't currently support ranged operations, so there would be some work to do. Adding such support shouldn't be too difficult though, and it does seem like the right tool for the job.

@tbg (Member) commented Apr 7, 2020

I hacked something up (see #47120) and it's definitely sending lots of random scans, but I haven't gotten a repro, at least not immediately over hundreds of runs.
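
Roughly the shape of the traffic that generates, as a standalone sketch; runScan is a hypothetical stand-in for issuing a limited Scan or ReverseScan against a test cluster (kvnemesis does this through its applier):

package main

import (
	"fmt"
	"math/rand"
)

// runScan is a hypothetical stand-in for sending a (Reverse)Scan with a key
// limit and returning how many rows came back; here it just fakes a result.
func runScan(start, end byte, limit int64, reverse bool) int64 {
	n := int64(end - start)
	if n > limit {
		n = limit
	}
	return n
}

func main() {
	rng := rand.New(rand.NewSource(1))
	for i := 0; i < 1000; i++ {
		start := byte('a' + rng.Intn(20))
		end := start + byte(1+rng.Intn(5))
		limit := int64(1 + rng.Intn(10))
		if got := runScan(start, end, limit, rng.Intn(2) == 0); got > limit {
			panic(fmt.Sprintf("received %d results, limit was %d", got, limit))
		}
	}
	fmt.Println("no limit violations over 1000 random scans")
}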

tbg added a commit to tbg/cockroach that referenced this issue Apr 7, 2020
Hack up just enough support for scans to see if we can perhaps tickle
the release blocker here:

cockroachdb#46652

No validation was added. We're just using kvnemesis to send random
scans for us.

Release note: None
@tbg (Member) commented Apr 7, 2020

Status update: I asked @nvanbenschoten to do a code audit pass to see if he can spot a bug in my refactors. If that fails to turn up anything, I suggest we release.
NB: there haven't been new occurrences on Sentry other than those two (which were likely generated by the same user), though rc3 adoption is likely low.

@nvanbenschoten (Member)

I don't have anything new to report here, but I did just find #40958, which may be relevant.

@nvanbenschoten (Member)

Nothing is jumping out at me here. RocksDB's and Pebble's implementations of MVCCScan both look good. So does evaluateBatch.

My only suggestions are that you:

  1. adapt #47120 ([dnm] use scans in kvnemesis) to ReverseScan
  2. hack around https://github.com/cockroachdb/cockroach/pull/47120/files#diff-e04ef583219215b6b7c19c4312e683dcR171 to assign a limit to batches

We know from the stack trace that the request was a BatchRequest with MaxSpanRequestKeys (and TargetBytes) set, consisting of either all ScanRequests or all ReverseScanRequests, so we should test all of those cases.
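
A sketch of the case matrix that observation implies (values made up), enumerating forward/reverse scans crossed with key and byte limits:

package main

import "fmt"

func main() {
	// The stack trace implies batches of all Scans or all ReverseScans with
	// MaxSpanRequestKeys (and possibly TargetBytes) set; each combination is
	// a case worth exercising in the randomized test.
	for _, reverse := range []bool{false, true} {
		for _, maxKeys := range []int64{1, 3, 10} {
			for _, targetBytes := range []int64{0, 1 << 10} {
				fmt.Printf("case: reverse=%t maxKeys=%d targetBytes=%d\n",
					reverse, maxKeys, targetBytes)
			}
		}
	}
}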

@tbg (Member) commented Apr 7, 2020 via email

@tbg (Member) commented Apr 8, 2020

Hmm, in #40958 the user sort of claims that they were running with faulty memory? I don't trust that very much, but it's good to see that we saw this issue well before I made changes in this area. This is further evidence that it should not block the release.

Will look into your suggestions now.

@tbg (Member) commented Apr 8, 2020

I found more real bugs after putting in your suggestions (see #47120), though no repro for this one.

@tbg (Member) commented Apr 15, 2020

Not really for this issue (here we're looking at a read-only batch), but what RevertRange does is super suspicious because that command isn't even MVCC-aware and doesn't want the usual semantics. It will happily go through the whole keyspace range by range until it hits its limit on one of the ranges. Then, once it has hit the limit, subsequent requests in the same batch still clear the first key (above this code).

// If, and only if, we're returning a resume span do we want to return >0
// NumKeys. Distsender will reduce the limit for subsequent requests by the
// amount returned, but that doesn't really make sense for RevertRange:
// there isn't some combined result set size we're trying to hit across many
// requests; just because some earlier range ran X Clears that does not mean
// we want the next range to run fewer than the limit chosen for the batch
// size reasons. On the other hand, we have to pass MaxKeys though if we
// return a resume span to cause distsender to stop after this request, as
// currently response combining's handling of resume spans prefers that
// there only be one. Thus we just set it to MaxKeys when, and only when,
// we're returning a ResumeSpan.
reply.NumKeys = cArgs.Header.MaxSpanRequestKeys
reply.ResumeReason = roachpb.RESUME_KEY_LIMIT

for _, span := range spans {
	b.AddRawRequest(&roachpb.RevertRangeRequest{
		RequestHeader: roachpb.RequestHeader{
			Key:    span.Key,
			EndKey: span.EndKey,
		},
		TargetTime: targetTime,
	})
}
b.Header.MaxSpanRequestKeys = batchSize
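
Schematically, the limit accounting described above looks like this (a toy sketch, not the actual DistSender code): the key limit for each subsequent range is reduced by the NumKeys the previous range reported, and a resume span stops the batch.

package main

import "fmt"

// rangeResult models what one per-range response contributes.
type rangeResult struct {
	numKeys   int64
	hasResume bool
}

func main() {
	const maxSpanRequestKeys int64 = 10
	responses := []rangeResult{{numKeys: 4}, {numKeys: 3}, {numKeys: 3, hasResume: true}}
	remaining := maxSpanRequestKeys
	for i, r := range responses {
		fmt.Printf("range %d: limit=%d, got %d keys\n", i, remaining, r.numKeys)
		// Reduce the limit for the next range by what this range consumed;
		// stop once a resume span comes back or the limit is exhausted.
		remaining -= r.numKeys
		if r.hasResume || remaining <= 0 {
			break
		}
	}
}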

@knz (Contributor) commented Apr 29, 2020

Looks like RC2 is still affected: #48121

@tbg (Member) commented Apr 29, 2020

Oh nice. OK, here's the rub: using rocksdb and start-single-node,

received %d results, limit was %d (original limit: %d, batch=%s idx=%d)
-- arg 1: 7
-- arg 2: 3
-- arg 3: 3
-- arg 4: 1 Scan
-- arg 5: 0

We received 7 results but wanted 3; the original limit was also 3 (duh), and the batch was just a single Scan.

Very surprising. The report was triggered by a long SELECT statement.
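
Schematically, this is the invariant that fired, mirroring the assertion message (a toy sketch, not the actual replica_read.go code):

package main

import "fmt"

// checkResultCount mirrors, schematically, the assertion that fired: a
// read-only batch must never return more rows than the key limit it carried.
func checkResultCount(numResults, limit int64) error {
	if limit > 0 && numResults > limit {
		return fmt.Errorf("received %d results, limit was %d", numResults, limit)
	}
	return nil
}

func main() {
	// The values from the report: 7 rows for a single Scan with a limit of 3.
	if err := checkResultCount(7, 3); err != nil {
		fmt.Println(err)
	}
}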

@tbg (Member) commented Apr 29, 2020

There are a dozen or so repros from this, all on young clusters (start-single-node), so it's likely that one person/entity is able to hit this bug and is doing it over and over. The details are always either the above one or this (which is the same, just with different numbers):

received %d results, limit was %d (original limit: %d, batch=%s idx=%d)
-- arg 1: 4
-- arg 2: 3
-- arg 3: 3
-- arg 4: 1 Scan
-- arg 5: 0

The statements triggering this are:

  1. SELECT _._, ............, _._ FROM _._ LIMIT _
  2. SELECT (_) AS _ FROM _ WHERE ((_._ >= _) AND (_._ <= _)) LIMIT _

The second one is also what triggered it in the first report in this thread.

@tbg (Member) commented Apr 29, 2020

I must say this is weird. Single forward scans are the least likely to hit any kind of bug here. The three clusters that generated all the reports have had no involvement in any other sentry reports (i.e. they don't have busted disks or something like that). I hope the error, which links to this issue, motivates the person running those queries to come forward and file a report here.

@tbg (Member) commented Apr 29, 2020

I looked at the code again and have no idea how this could happen, except for a bug at the MVCC layer or below. I was hoping that, somehow, magically, this could be caused by the internal read retry here

https://github.com/cockroachdb/cockroach/blob/a72b871dfb16322eb54497ab99c03107e11dc36a/pkg/kv/kvserver/replica_read.go#L134-L133

but after injecting an artificial retry there, it didn't pan out (just as reading the code suggested it would not).

@tbg removed their assignment Apr 29, 2020
@tbg (Member) commented Apr 29, 2020

@itsbilal seems to have figured it out (or at least found a plausible bug). 🙇‍♂️ and handing over.

itsbilal added a commit to itsbilal/cockroach that referenced this issue Apr 29, 2020
We weren't checking for MaxKeys (or TargetBytes) being reached
in the case where we read from intent history in the MVCC scanner.
All other cases go through addAndAdvance(), which had these checks.

Almost certainly fixes cockroachdb#46652. Would be very surprised if it was
something else.

Release note (bug fix): Fixes a bug where a read operation in a transaction
would error out for exceeding the maximum count of results returned.
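
Schematically, the shape of the fix (a toy sketch, not the actual pebbleMVCCScanner or libroach code): the intent-history read path must go through the same MaxKeys/TargetBytes accounting that addAndAdvance already performs.

package main

import "fmt"

// scanner is a toy stand-in for the MVCC scanner state relevant here.
type scanner struct {
	maxKeys int64
	results int64
	resume  bool
}

// addAndAdvance holds the limit check: once maxKeys results have been
// gathered, record a resume point and stop instead of adding more.
func (s *scanner) addAndAdvance() bool {
	if s.maxKeys > 0 && s.results >= s.maxKeys {
		s.resume = true
		return false
	}
	s.results++
	return true
}

// getFromIntentHistory models the buggy path: before the fix it added the
// value unconditionally; after the fix it respects the same limit check.
func (s *scanner) getFromIntentHistory() bool {
	return s.addAndAdvance()
}

func main() {
	s := &scanner{maxKeys: 3}
	for i := 0; i < 7; i++ {
		if !s.getFromIntentHistory() {
			break
		}
	}
	fmt.Printf("results=%d resume=%t\n", s.results, s.resume) // results=3 resume=true
}
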
craig bot pushed a commit that referenced this issue Apr 29, 2020
46992: sql: Add Logical Column ID field to ColumnDescriptor r=rohany a=RichardJCai

The LogicalColumnID field mimics the ColumnID field; however, LogicalColumnID may be swapped between two columns, whereas ColumnID cannot. LogicalColumnID is referenced for virtual tables (pg_catalog, information_schema) and most notably affects column ordering for SHOW COLUMNS.

This LogicalColumnID field supports swapping the order of two columns; currently it is only used for ALTER COLUMN TYPE, when a shadow column is created and swapped with its original column.

Does not affect existing behaviour.

Release note: None

47449: cli: add --cert-principal-map to client commands r=petermattis a=petermattis

Add support for the `--cert-principal-map` flag to the certs and client
commands. Anywhere we were accepting the `--certs-dir` flag, we now also
accept the `--cert-principal-map` flag.

Fixes #47300

Release note (cli change): Support the `--cert-principal-map` flag in
the `cert *` and "client" commands such as `sql`.

48138: keys: support splitting Ranges on tenant-id prefixed keys r=nvanbenschoten a=nvanbenschoten

Fixes #48122.
Relates to #47903.
Relates to #48123.

This PR contains a series of small commits that work towards the introduction of tenant-id prefixed keyspaces and begin the removal of some `keys.TODOSQLCodec` instances. This should be the only time we need to touch C++ throughout this work.

48160: storage,libroach: Check for MaxKeys when reading from intent history r=itsbilal a=itsbilal

We weren't checking for MaxKeys (or TargetBytes) being reached
in the case where we read from intent history in the MVCC scanner.
All other cases go through addAndAdvance(), which had these checks.

Almost certainly fixes #46652. Would be very surprised if it was
something else.

Release note (bug fix): Fixes a bug where a read operation in a transaction
would error out for exceeding the maximum count of results returned.

48162: opt: add rule to eliminate Exists when input has zero rows r=rytaft a=rytaft

This commit adds a new rule, `EliminateExistsZeroRows`, which
converts an `Exists` subquery to False when it's known
that the input produces zero rows.

Informs #47058

Release note (performance improvement): The optimizer can now
detect when an Exists subquery can be eliminated because the input
has zero rows. This leads to better plans in some cases.

Co-authored-by: richardjcai <[email protected]>
Co-authored-by: Peter Mattis <[email protected]>
Co-authored-by: Nathan VanBenschoten <[email protected]>
Co-authored-by: Bilal Akhtar <[email protected]>
Co-authored-by: Rebecca Taft <[email protected]>
@craig (craig bot) closed this as completed in 3628ed9 Apr 29, 2020
itsbilal added a commit to itsbilal/cockroach that referenced this issue May 1, 2020
Backport of cockroachdb#48160.

We weren't checking for MaxKeys (or TargetBytes) being reached
in the case where we read from intent history in the MVCC scanner.
All other cases go through addAndAdvance(), which had these checks.

Almost certainly fixes cockroachdb#46652. Would be very surprised if it was
something else.

Release note (bug fix): Fixes a bug where a read operation in a transaction
would error out for exceeding the maximum count of results returned.
timgraham added a commit to timgraham/django-cockroachdb that referenced this issue May 7, 2020
timgraham added a commit to timgraham/django-cockroachdb that referenced this issue May 10, 2020
timgraham added a commit to timgraham/django-cockroachdb that referenced this issue May 11, 2020