Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

sql: v20.2.0: nil pointer panic in FilterDescriptorState #57639

Closed
cockroach-teamcity opened this issue Dec 7, 2020 · 9 comments
Closed

sql: v20.2.0: nil pointer panic in FilterDescriptorState #57639

cockroach-teamcity opened this issue Dec 7, 2020 · 9 comments
Assignees
Labels
C-bug Code not up to spec/doc, specs & docs deemed correct. Solution expected to change code/behavior. O-sentry Originated from an in-the-wild panic report.

Comments

@cockroach-teamcity
Copy link
Member

This issue was autofiled by Sentry. It represents a crash or reported error on a live cluster with telemetry enabled.

Sentry link: https://sentry.io/organizations/cockroach-labs/issues/2077312662/?referrer=webhooks_plugin

Panic message:

*barriers.barrierError
catch.go:29: *withstack.withStack (top exception)
*assert.withAssertionFailure

Stacktrace (expand for inline code snippets):

// get reported to Sentry.
err = errors.HandleAsAssertionFailure(err)
}
in pkg/util/errorutil.ShouldCatch
// manipulate locks.
if ok, e := errorutil.ShouldCatch(r); ok {
err = e
in pkg/sql/opt/optbuilder.(*Builder).Build.func1
/usr/local/go/src/runtime/panic.go#L678-L680 in runtime.gopanic
/usr/local/go/src/runtime/panic.go#L198-L200 in runtime.panicmem
/usr/local/go/src/runtime/signal_unix.go#L393-L395 in runtime.sigpanic
switch {
case desc.Dropped() && !flags.IncludeDropped:
return NewInactiveDescriptorError(ErrDescriptorDropped)
in pkg/sql/catalog.FilterDescriptorState
// OFFLINE if the relevant flag is set.
if err := catalog.FilterDescriptorState(desc, flags.CommonLookupFlags); err != nil {
if flags.Required {
in pkg/sql/catalog/catalogkv.UncachedPhysicalAccessor.GetObjectDesc
phyAccessor := catalogkv.UncachedPhysicalAccessor{}
return phyAccessor.GetObjectDesc(
ctx,
in pkg/sql/catalog/descs.(*Collection).getObjectVersion.func1
if shouldReadFromStore {
return readObjectFromStore()
}
in pkg/sql/catalog/descs.(*Collection).getObjectVersion
) (*tabledesc.Immutable, error) {
desc, err := tc.getObjectVersion(ctx, txn, tn, flags)
if err != nil {
in pkg/sql/catalog/descs.(*Collection).GetTableVersion
}
table, err := a.tc.GetTableVersion(ctx, txn, &a.tableName, flags)
if table == nil {
in pkg/sql/catalog/accessors.(*CachedPhysicalAccessor).GetObjectDesc
// Fallthrough.
return l.Accessor.GetObjectDesc(ctx, txn, settings, codec, db, schema, object, flags)
}
in pkg/sql/catalog/accessors.(*LogicalSchemaAccessor).GetObjectDesc
lookupFlags.CommonLookupFlags.AvoidCached = p.avoidCachedDescriptors
objDesc, err := sc.GetObjectDesc(ctx, p.txn, p.ExecCfg().Settings, p.ExecCfg().Codec, dbName, scName, tbName, lookupFlags)
in pkg/sql.(*planner).LookupObject
namePrefix.SchemaName = Name(scName)
found, result, err := r.LookupObject(ctx, lookupFlags, u.Catalog(), scName, u.Object())
return found, namePrefix, result, err
in pkg/sql/sem/tree.ResolveExisting
) (res tree.NameResolutionResult, prefix tree.ObjectNamePrefix, err error) {
found, prefix, descI, err := tree.ResolveExisting(ctx, un, sc, lookupFlags, sc.CurrentDatabase(), sc.CurrentSearchPath())
if err != nil {
in pkg/sql/catalog/resolver.ResolveExistingObject
un := tn.ToUnresolvedObjectName()
desc, prefix, err := ResolveExistingObject(ctx, sc, un, lookupFlags)
if err != nil || desc == nil {
in pkg/sql/catalog/resolver.ResolveExistingTableObject
lflags := tree.ObjectLookupFlagsWithRequiredTableKind(tree.ResolveAnyTableKind)
desc, err := resolver.ResolveExistingTableObject(ctx, oc.planner, &oc.tn, lflags)
if err != nil {
in pkg/sql.(*optCatalog).ResolveDataSource
}
ds, resName, err := b.catalog.ResolveDataSource(b.ctx, flags, tn)
if err != nil {
in pkg/sql/opt/optbuilder.(*Builder).resolveDataSource
ds, resName := b.resolveDataSource(tn, priv)
switch t := ds.(type) {
in pkg/sql/opt/optbuilder.(*Builder).buildDataSource
outScope = b.buildDataSource(source.Expr, indexFlags, locking, inScope)
in pkg/sql/opt/optbuilder.(*Builder).buildDataSource
) (outScope *scope) {
outScope = b.buildDataSource(tables[0], nil /* indexFlags */, locking, inScope)
in pkg/sql/opt/optbuilder.(*Builder).buildFromTablesRightDeep
}
return b.buildFromTablesRightDeep(tables, locking, inScope)
}
in pkg/sql/opt/optbuilder.(*Builder).buildFromTables
if len(from.Tables) > 0 {
outScope = b.buildFromTables(from.Tables, locking, inScope)
} else {
in pkg/sql/opt/optbuilder.(*Builder).buildFrom
) (outScope *scope) {
fromScope := b.buildFrom(sel.From, locking, inScope)
in pkg/sql/opt/optbuilder.(*Builder).buildSelectClause
case *tree.SelectClause:
outScope = b.buildSelectClause(t, orderBy, locking, desiredTypes, inScope)
in pkg/sql/opt/optbuilder.(*Builder).buildSelectStmtWithoutParens
return b.processWiths(with, inScope, func(inScope *scope) *scope {
return b.buildSelectStmtWithoutParens(
wrapped, orderBy, limit, locking, desiredTypes, inScope,
in pkg/sql/opt/optbuilder.(*Builder).buildSelect.func1
inScope.atRoot = false
outScope := buildStmt(inScope)
inScope.atRoot = prevAtRoot
in pkg/sql/opt/optbuilder.(*Builder).processWiths
return b.processWiths(with, inScope, func(inScope *scope) *scope {
return b.buildSelectStmtWithoutParens(
in pkg/sql/opt/optbuilder.(*Builder).buildSelect
emptyScope := b.allocScope()
outScope = b.buildSelect(sel, locking, nil /* desiredTypes */, emptyScope)
emptyScope.parent = inScope
in pkg/sql/opt/optbuilder.(*Builder).buildView
case cat.View:
return b.buildView(t, &resName, locking, inScope)
in pkg/sql/opt/optbuilder.(*Builder).buildDataSource
outScope = b.buildDataSource(source.Expr, indexFlags, locking, inScope)
in pkg/sql/opt/optbuilder.(*Builder).buildDataSource
) (outScope *scope) {
outScope = b.buildDataSource(tables[0], nil /* indexFlags */, locking, inScope)
in pkg/sql/opt/optbuilder.(*Builder).buildFromTablesRightDeep

pkg/util/errorutil/catch.go in pkg/util/errorutil.ShouldCatch at line 29
pkg/sql/opt/optbuilder/builder.go in pkg/sql/opt/optbuilder.(*Builder).Build.func1 at line 165
/usr/local/go/src/runtime/panic.go in runtime.gopanic at line 679
/usr/local/go/src/runtime/panic.go in runtime.panicmem at line 199
/usr/local/go/src/runtime/signal_unix.go in runtime.sigpanic at line 394
pkg/sql/catalog/descriptor.go in pkg/sql/catalog.FilterDescriptorState at line 201
pkg/sql/catalog/catalogkv/physical_accessor.go in pkg/sql/catalog/catalogkv.UncachedPhysicalAccessor.GetObjectDesc at line 320
pkg/sql/catalog/descs/collection.go in pkg/sql/catalog/descs.(*Collection).getObjectVersion.func1 at line 783
pkg/sql/catalog/descs/collection.go in pkg/sql/catalog/descs.(*Collection).getObjectVersion at line 860
pkg/sql/catalog/descs/collection.go in pkg/sql/catalog/descs.(*Collection).GetTableVersion at line 760
pkg/sql/catalog/accessors/physical_schema_accessors.go in pkg/sql/catalog/accessors.(*CachedPhysicalAccessor).GetObjectDesc at line 132
pkg/sql/catalog/accessors/logical_schema_accessors.go in pkg/sql/catalog/accessors.(*LogicalSchemaAccessor).GetObjectDesc at line 136
pkg/sql/resolver.go in pkg/sql.(*planner).LookupObject at line 209
pkg/sql/sem/tree/name_resolution.go in pkg/sql/sem/tree.ResolveExisting at line 302
pkg/sql/catalog/resolver/resolver.go in pkg/sql/catalog/resolver.ResolveExistingObject at line 153
pkg/sql/catalog/resolver/resolver.go in pkg/sql/catalog/resolver.ResolveExistingTableObject at line 90
pkg/sql/opt_catalog.go in pkg/sql.(*optCatalog).ResolveDataSource at line 207
pkg/sql/opt/optbuilder/util.go in pkg/sql/opt/optbuilder.(*Builder).resolveDataSource at line 586
pkg/sql/opt/optbuilder/select.go in pkg/sql/opt/optbuilder.(*Builder).buildDataSource at line 110
pkg/sql/opt/optbuilder/select.go in pkg/sql/opt/optbuilder.(*Builder).buildDataSource at line 59
pkg/sql/opt/optbuilder/select.go in pkg/sql/opt/optbuilder.(*Builder).buildFromTablesRightDeep at line 1219
pkg/sql/opt/optbuilder/select.go in pkg/sql/opt/optbuilder.(*Builder).buildFromTables at line 1196
pkg/sql/opt/optbuilder/select.go in pkg/sql/opt/optbuilder.(*Builder).buildFrom at line 1123
pkg/sql/opt/optbuilder/select.go in pkg/sql/opt/optbuilder.(*Builder).buildSelectClause at line 1044
pkg/sql/opt/optbuilder/select.go in pkg/sql/opt/optbuilder.(*Builder).buildSelectStmtWithoutParens at line 992
pkg/sql/opt/optbuilder/select.go in pkg/sql/opt/optbuilder.(*Builder).buildSelect.func1 at line 965
pkg/sql/opt/optbuilder/with.go in pkg/sql/opt/optbuilder.(*Builder).processWiths at line 29
pkg/sql/opt/optbuilder/select.go in pkg/sql/opt/optbuilder.(*Builder).buildSelect at line 964
pkg/sql/opt/optbuilder/select.go in pkg/sql/opt/optbuilder.(*Builder).buildView at line 296
pkg/sql/opt/optbuilder/select.go in pkg/sql/opt/optbuilder.(*Builder).buildDataSource at line 128
pkg/sql/opt/optbuilder/select.go in pkg/sql/opt/optbuilder.(*Builder).buildDataSource at line 59
pkg/sql/opt/optbuilder/select.go in pkg/sql/opt/optbuilder.(*Builder).buildFromTablesRightDeep at line 1219
Tag Value
Cockroach Release v20.2.0
Cockroach SHA: 150c591
Platform linux amd64
Distribution CCL
Environment v20.2.0
Command server
Go Version ``
# of CPUs
# of Goroutines
@cockroach-teamcity cockroach-teamcity added C-bug Code not up to spec/doc, specs & docs deemed correct. Solution expected to change code/behavior. O-sentry Originated from an in-the-wild panic report. labels Dec 7, 2020
@yuzefovich yuzefovich changed the title from "sentry: *barriers.barrierError catch.go:29: *withstack.withStack (top exception) *assert.withAssertionFailure" to "sql: v20.2.0: nil pointer panic in FilterDescriptorState" Dec 7, 2020
@jordanlewis
Copy link
Member

Tentatively assigning @postamar - this could be a good entrance point into some new parts of the code.

@SpeedyCoder
Copy link

Hey, based on the dates and database version, I think it was our database that triggered the duplicate issue above, #57779. It seems like trying to query all but the migration tables in the database causes this exception. Let me know if there is any way we can help resolve this issue. Also, is there any way to fix this at the moment without downgrading the cluster?

@nick-jones
Copy link

@jordanlewis Hi, any comment on the above? We'd like to start recovery steps for this database, so not planning to leave it in this state for too much longer.

Also, any indication what causes this issue?

@jordanlewis
Copy link
Member

cc @cockroachdb/sql-schema

@postamar
Copy link
Contributor

I believe I've identified the cause of the problem. Affected versions are v20.2.x. I submitted a PR with a fix targeting the release-20.2 branch; if it's accepted, it should make it into the next 20.2 patch release.

Sorry for taking so long on this.

postamar pushed a commit to postamar/cockroach that referenced this issue Jan 14, 2021
Previously, catalog.FilterDescriptorState was sometimes called with
a nil descriptor interface. This patch fixes this. This bug only
affects v20.2.x.

Fixes cockroachdb#57639.

Release note (bug fix): Fixed a nil pointer panic bug involving
catalog.FilterDescriptorState. This bug affected version 20.2 since
v20.2.0.
postamar pushed a commit to postamar/cockroach that referenced this issue Jan 14, 2021
Previously, catalog.FilterDescriptorState was sometimes called with
a nil descriptor interface. This patch fixes this. This bug only
affects v20.2.x.

Fixes cockroachdb#57639.

Release note (bug fix): Fixed a nil pointer panic bug involving
catalog.FilterDescriptorState. This bug affected version 20.2 since
v20.2.0.
@nick-jones
Copy link

nick-jones commented Jan 15, 2021

In case it's of interest, this is how the issue manifests itself:

root@cockroachdb-proxy:26257/db> SELECT COUNT(*) FROM customer;
ERROR: internal error: runtime error: invalid memory address or nil pointer dereference
SQLSTATE: XX000
DETAIL: stack trace:
/go/src/github.com/cockroachdb/cockroach/pkg/util/errorutil/catch.go:29: ShouldCatch()
/go/src/github.com/cockroachdb/cockroach/pkg/sql/opt/optbuilder/builder.go:165: func1()
/usr/local/go/src/runtime/panic.go:679: gopanic()
/usr/local/go/src/runtime/panic.go:199: panicmem()
/usr/local/go/src/runtime/signal_unix.go:394: sigpanic()
/go/src/github.com/cockroachdb/cockroach/pkg/sql/catalog/descriptor.go:204: FilterDescriptorState()
/go/src/github.com/cockroachdb/cockroach/pkg/sql/catalog/catalogkv/physical_accessor.go:320: GetObjectDesc()
/go/src/github.com/cockroachdb/cockroach/pkg/sql/catalog/descs/collection.go:783: func1()
/go/src/github.com/cockroachdb/cockroach/pkg/sql/catalog/descs/collection.go:860: getObjectVersion()
/go/src/github.com/cockroachdb/cockroach/pkg/sql/catalog/descs/collection.go:760: GetTableVersion()
/go/src/github.com/cockroachdb/cockroach/pkg/sql/catalog/accessors/physical_schema_accessors.go:132: GetObjectDesc()
/go/src/github.com/cockroachdb/cockroach/pkg/sql/catalog/accessors/logical_schema_accessors.go:136: GetObjectDesc()
/go/src/github.com/cockroachdb/cockroach/pkg/sql/resolver.go:209: LookupObject()
/go/src/github.com/cockroachdb/cockroach/pkg/sql/sem/tree/name_resolution.go:337: ResolveExisting()
/go/src/github.com/cockroachdb/cockroach/pkg/sql/catalog/resolver/resolver.go:153: ResolveExistingObject()
/go/src/github.com/cockroachdb/cockroach/pkg/sql/catalog/resolver/resolver.go:90: ResolveExistingTableObject()
/go/src/github.com/cockroachdb/cockroach/pkg/sql/opt_catalog.go:207: ResolveDataSource()
/go/src/github.com/cockroachdb/cockroach/pkg/sql/opt/optbuilder/util.go:587: resolveDataSource()
/go/src/github.com/cockroachdb/cockroach/pkg/sql/opt/optbuilder/select.go:103: buildDataSource()
/go/src/github.com/cockroachdb/cockroach/pkg/sql/opt/optbuilder/select.go:59: buildDataSource()
/go/src/github.com/cockroachdb/cockroach/pkg/sql/opt/optbuilder/select.go:1219: buildFromTablesRightDeep()
/go/src/github.com/cockroachdb/cockroach/pkg/sql/opt/optbuilder/select.go:1196: buildFromTables()
/go/src/github.com/cockroachdb/cockroach/pkg/sql/opt/optbuilder/select.go:1123: buildFrom()
/go/src/github.com/cockroachdb/cockroach/pkg/sql/opt/optbuilder/select.go:1044: buildSelectClause()
/go/src/github.com/cockroachdb/cockroach/pkg/sql/opt/optbuilder/select.go:992: buildSelectStmtWithoutParens()
/go/src/github.com/cockroachdb/cockroach/pkg/sql/opt/optbuilder/select.go:965: func1()
/go/src/github.com/cockroachdb/cockroach/pkg/sql/opt/optbuilder/with.go:29: processWiths()
/go/src/github.com/cockroachdb/cockroach/pkg/sql/opt/optbuilder/select.go:964: buildSelect()
/go/src/github.com/cockroachdb/cockroach/pkg/sql/opt/optbuilder/builder.go:265: buildStmt()
/go/src/github.com/cockroachdb/cockroach/pkg/sql/opt/optbuilder/builder.go:229: buildStmtAtRoot()
/go/src/github.com/cockroachdb/cockroach/pkg/sql/opt/optbuilder/builder.go:200: Build()
/go/src/github.com/cockroachdb/cockroach/pkg/sql/plan_opt.go:512: buildExecMemo()

HINT: You have encountered an unexpected error.

Please check the public issue tracker to check whether this problem is
already tracked. If you cannot find it there, please report the error
with details by creating a new issue.

If you would rather not post publicly, please contact us directly
using the support form.

We appreciate your feedback.
root@cockroachdb-proxy:26257/db> 

This particular database has 5 tables; 3 of the 5 trigger the error. Other databases and tables in the same cluster don't exhibit the issue.

Version:

root@cockroachdb-proxy:26257/db> SELECT VERSION();
                                          version
--------------------------------------------------------------------------------------------
  CockroachDB CCL v20.2.3 (x86_64-unknown-linux-gnu, built 2020/12/14 18:33:39, go1.13.14)
(1 row)

Time: 1ms total (execution 0ms / network 1ms)

@thoszhang
Copy link
Contributor

Hi @nick-jones, thanks for that information.

Was the database that's having problems created on a different cluster version than the others? Or is there anything else special about it that you're aware of?

It would be helpful if you could send us a debug zip. This will contain some sensitive information (metadata), so you can send it to us at [email protected] if you'd prefer to not attach it to this issue.

postamar pushed a commit to postamar/cockroach that referenced this issue Jan 15, 2021
Previously, catalog.FilterDescriptorState was sometimes called with
a nil descriptor interface. This patch fixes this. This bug only
affects v20.2.x.

Fixes cockroachdb#57639.

Release note (bug fix): Fixed a nil pointer panic bug involving
catalog.FilterDescriptorState. This bug affected version 20.2 since
v20.2.0.
@nick-jones
Copy link

nick-jones commented Jan 18, 2021

@lucy-zhang -

> Hi @nick-jones, thanks for that information.
>
> Was the database that's having problems created on a different cluster version than the others? Or is there anything else special about it that you're aware of?

We cannot think of anything particularly unusual about this database. We're not sure if it was created against a different cluster version, unfortunately.

> It would be helpful if you could send us a debug zip. This will contain some sensitive information (metadata), so you can send it to us at [email protected] if you'd prefer to not attach it to this issue.

No problem, I've emailed it over 👍

postamar pushed a commit to postamar/cockroach that referenced this issue Jan 20, 2021
Previously, catalog.FilterDescriptorState was sometimes called with
a nil descriptor interface. This patch fixes this. This bug only
affects v20.2.x.

Fixes cockroachdb#57639.

Release note (bug fix): Fixed a nil pointer panic bug involving
catalog.FilterDescriptorState. This bug affected version 20.2 since
v20.2.0.
postamar pushed a commit to postamar/cockroach that referenced this issue Jan 21, 2021
Previously, catalog.FilterDescriptorState was sometimes called with
a nil descriptor interface. This patch fixes this. This bug only
affects v20.2.x.

Fixes cockroachdb#57639.

Release note (bug fix): Fixed a nil pointer panic bug involving
catalog.FilterDescriptorState. This bug affected version 20.2 since
v20.2.0.
@thoszhang
Copy link
Contributor

The panic was fixed in #58998. The root cause of the descriptor corruption is unclear. There are pairs of doctor errors like this:

Table <id1>: ParentID <db>, ParentSchemaID <schema>, Name '<name>': not being dropped but no namespace entry found
Descriptor <id2>: has namespace row(s) [{ParentID:<db> ParentSchemaID:<schema> Name:<name>}] but no descriptor

id1 and id2 are not equal (even though they presumably correspond logically to the "same" table), hence the panic after retrieving the namespace entry. What's interesting is that the ReplacementOf table ID (formerly, the previous ID of a truncated table) for the descriptor id1 is sometimes id2, but sometimes not.

This points to a truncate-related bug (in the old implementation). There's no DropJobID on these descriptors, though, so it's not #50587 (and other symptoms don't add up). It's also possible there are two different bugs/sources of corruption here.

We don't have any further information to act on, so I'm closing this, but if we see something similar again we can compare.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
C-bug Code not up to spec/doc, specs & docs deemed correct. Solution expected to change code/behavior. O-sentry Originated from an in-the-wild panic report.
Projects
None yet
Development

No branches or pull requests

6 participants