Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

sql: panic when reverting schema change jobs on databases and schemas #59415

Closed
cockroach-teamcity opened this issue Jan 26, 2021 · 4 comments · Fixed by #61159
Closed

sql: panic when reverting schema change jobs on databases and schemas #59415

cockroach-teamcity opened this issue Jan 26, 2021 · 4 comments · Fixed by #61159
Assignees
Labels
C-bug Code not up to spec/doc, specs & docs deemed correct. Solution expected to change code/behavior. O-sentry Originated from an in-the-wild panic report.

Comments

@cockroach-teamcity
Copy link
Member

This issue was autofiled by Sentry. It represents a crash or reported error on a live cluster with telemetry enabled.

Sentry link: https://sentry.io/organizations/cockroach-labs/issues/2178735398/?referrer=webhooks_plugin

Panic message:

panic.go:679: interface conversion: catalog.MutableDescriptor is *dbdesc.Mutable, not *tabledesc.Mutable
--
*runtime.TypeAssertionError
panic.go:679: *withstack.withStack (top exception)

Stacktrace (expand for inline code snippets):

/usr/local/go/src/runtime/panic.go#L678-L680 in runtime.gopanic
/usr/local/go/src/runtime/iface.go#L254-L256 in runtime.panicdottypeE
/usr/local/go/src/runtime/iface.go#L264-L266 in runtime.panicdottypeI

}
table := desc.(*tabledesc.Mutable)
hydrated, err := tc.hydrateTypesInTableDesc(ctx, txn, table)
in pkg/sql/catalog/descs.(*Collection).GetMutableTableVersionByID
fksByBackrefTable = make(map[descpb.ID][]*descpb.ConstraintToUpdate)
scTable, err := descsCol.GetMutableTableVersionByID(ctx, sc.descID, txn)
if err != nil {
in pkg/sql.(*SchemaChanger).maybeReverseMutations.func1
) error {
if err := f(ctx, txn, descsCol); err != nil {
return err
in pkg/sql.(*SchemaChanger).txnWithModified.func1
}
if err := f(ctx, txn, descsCol); err != nil {
return err
in pkg/sql/catalog/descs.Txn.func1

cockroach/pkg/kv/db.go

Lines 706 to 708 in eda2309

err := txn.exec(ctx, func(ctx context.Context, txn *Txn) error {
return retryable(ctx, txn)
})
in pkg/kv.(*DB).Txn.func1

cockroach/pkg/kv/txn.go

Lines 810 to 812 in eda2309

}
err = fn(ctx, txn)
in pkg/kv.(*Txn).exec

cockroach/pkg/kv/db.go

Lines 705 to 707 in eda2309

txn.SetDebugName("unnamed")
err := txn.exec(ctx, func(ctx context.Context, txn *Txn) error {
return retryable(ctx, txn)
in pkg/kv.(*DB).Txn
for {
if err := db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
defer descsCol.ReleaseAll(ctx)
in pkg/sql/catalog/descs.Txn
ie := sc.ieFactory(ctx, newFakeSessionData())
if err := descs.Txn(ctx, sc.settings, sc.leaseMgr, ie, sc.db, func(
ctx context.Context, txn *kv.Txn, descsCol *descs.Collection,
in pkg/sql.(*SchemaChanger).txnWithModified
) error {
_, err := sc.txnWithModified(ctx, f)
return err
in pkg/sql.(*SchemaChanger).txn
const kvTrace = true // TODO(ajwerner): figure this out
err := sc.txn(ctx, func(ctx context.Context, txn *kv.Txn, descsCol *descs.Collection) error {
fksByBackrefTable = make(map[descpb.ID][]*descpb.ConstraintToUpdate)
in pkg/sql.(*SchemaChanger).maybeReverseMutations
log.Warningf(ctx, "reversing schema change %d due to irrecoverable error: %s", *sc.job.ID(), err)
if errReverse := sc.maybeReverseMutations(ctx, err); errReverse != nil {
return errReverse
in pkg/sql.(*SchemaChanger).rollbackSchemaChange
if rollbackErr := sc.rollbackSchemaChange(ctx, err); rollbackErr != nil {
// From now on, the returned error will be a secondary error of the returned
in pkg/sql.(*SchemaChanger).handlePermanentSchemaChangeError
if rollbackErr := sc.handlePermanentSchemaChangeError(ctx, scErr, p.ExtendedEvalContext()); rollbackErr != nil {
switch {
in pkg/sql.schemaChangeResumer.OnFailOrCancel

cockroach/pkg/jobs/registry.go

Lines 1190 to 1192 in eda2309

defer jm.CurrentlyRunning.Dec(1)
err = resumer.OnFailOrCancel(onFailOrCancelCtx, phs)
}()
in pkg/jobs.(*Registry).stepThroughStateMachine.func2

cockroach/pkg/jobs/registry.go

Lines 1191 to 1193 in eda2309

err = resumer.OnFailOrCancel(onFailOrCancelCtx, phs)
}()
if successOnFailOrCancel := err == nil; successOnFailOrCancel {
in pkg/jobs.(*Registry).stepThroughStateMachine

cockroach/pkg/jobs/registry.go

Lines 1176 to 1178 in eda2309

// better.
return r.stepThroughStateMachine(ctx, phs, resumer, resultsCh, job, StatusReverting, errors.Wrapf(err, "could not mark job %d as succeeded", *job.ID()))
}
in pkg/jobs.(*Registry).stepThroughStateMachine

cockroach/pkg/jobs/registry.go

Lines 1124 to 1126 in eda2309

jm.ResumeCompleted.Inc(1)
return r.stepThroughStateMachine(ctx, phs, resumer, resultsCh, job, StatusSucceeded, nil)
}
in pkg/jobs.(*Registry).stepThroughStateMachine

cockroach/pkg/jobs/adopt.go

Lines 256 to 258 in eda2309

// Run the actual job.
err := r.stepThroughStateMachine(ctx, phs, resumer, resultsCh, job, status, finalResumeError)
if err != nil {
in pkg/jobs.(*Registry).runJob

cockroach/pkg/jobs/adopt.go

Lines 193 to 195 in eda2309

if err := r.stopper.RunAsyncTask(ctx, job.taskName(), func(ctx context.Context) {
r.runJob(resumeCtx, resumer, resultsCh, errCh, job, status, job.taskName(), nil)
}); err != nil {
in pkg/jobs.(*Registry).resumeJob.func1
f(ctx)
}()
in pkg/util/stop.(*Stopper).RunAsyncTask.func1
/usr/local/go/src/runtime/asm_amd64.s#L1356-L1358 in runtime.goexit

/usr/local/go/src/runtime/panic.go in runtime.gopanic at line 679
/usr/local/go/src/runtime/iface.go in runtime.panicdottypeE at line 255
/usr/local/go/src/runtime/iface.go in runtime.panicdottypeI at line 265
pkg/sql/catalog/descs/collection.go in pkg/sql/catalog/descs.(*Collection).GetMutableTableVersionByID at line 979
pkg/sql/schema_changer.go in pkg/sql.(*SchemaChanger).maybeReverseMutations.func1 at line 1454
pkg/sql/schema_changer.go in pkg/sql.(*SchemaChanger).txnWithModified.func1 at line 1937
pkg/sql/catalog/descs/txn.go in pkg/sql/catalog/descs.Txn.func1 at line 60
pkg/kv/db.go in pkg/kv.(*DB).Txn.func1 at line 707
pkg/kv/txn.go in pkg/kv.(*Txn).exec at line 811
pkg/kv/db.go in pkg/kv.(*DB).Txn at line 706
pkg/sql/catalog/descs/txn.go in pkg/sql/catalog/descs.Txn at line 55
pkg/sql/schema_changer.go in pkg/sql.(*SchemaChanger).txnWithModified at line 1934
pkg/sql/schema_changer.go in pkg/sql.(*SchemaChanger).txn at line 1924
pkg/sql/schema_changer.go in pkg/sql.(*SchemaChanger).maybeReverseMutations at line 1452
pkg/sql/schema_changer.go in pkg/sql.(*SchemaChanger).rollbackSchemaChange at line 806
pkg/sql/schema_changer.go in pkg/sql.(*SchemaChanger).handlePermanentSchemaChangeError at line 717
pkg/sql/schema_changer.go in pkg/sql.schemaChangeResumer.OnFailOrCancel at line 2236
pkg/jobs/registry.go in pkg/jobs.(*Registry).stepThroughStateMachine.func2 at line 1191
pkg/jobs/registry.go in pkg/jobs.(*Registry).stepThroughStateMachine at line 1192
pkg/jobs/registry.go in pkg/jobs.(*Registry).stepThroughStateMachine at line 1177
pkg/jobs/registry.go in pkg/jobs.(*Registry).stepThroughStateMachine at line 1125
pkg/jobs/adopt.go in pkg/jobs.(*Registry).runJob at line 257
pkg/jobs/adopt.go in pkg/jobs.(*Registry).resumeJob.func1 at line 194
pkg/util/stop/stopper.go in pkg/util/stop.(*Stopper).RunAsyncTask.func1 at line 347
/usr/local/go/src/runtime/asm_amd64.s in runtime.goexit at line 1357
Tag Value
Cockroach Release v20.2.4
Cockroach SHA: eda2309
Platform linux amd64
Distribution CCL
Environment v20.2.4
Command server
Go Version ``
# of CPUs
# of Goroutines
@cockroach-teamcity cockroach-teamcity added C-bug Code not up to spec/doc, specs & docs deemed correct. Solution expected to change code/behavior. O-sentry Originated from an in-the-wild panic report. labels Jan 26, 2021
@yuzefovich yuzefovich changed the title sentry: panic.go:679: interface conversion: catalog.MutableDescriptor is *dbdesc.Mutable, not *tabledesc.Mutable -- *runtime.TypeAssertionError panic.go:679: *withstack.withStack (top exception) sql/catalog: v20.2.4: interface conversion: catalog.MutableDescriptor is *dbdesc.Mutable, not *tabledesc.Mutable Jan 26, 2021
@thoszhang
Copy link
Contributor

The problem seems to be that a job for dropping a database was rolled back, but we're not actually set up to handle this in OnFailOrCancel and just assume everywhere that we have a table ID.

@thoszhang
Copy link
Contributor

thoszhang commented Jan 28, 2021

The problem is actually not with dropping databases, but with every schema change on a non-table descriptor other than dropping a database. The problem is that we check whether DroppedDatabaseID on the job details is nonempty and return early if it is (which is a vestige of when databases didn't really use the schema changer, due to not being leased, but DROP DATABASE CASCADE got its own job), but not other possible states of the job details.

So RENAME DATABASE will hit this panic because DescID is set, but not for a table. Same for user-defined schemas. I think there's a different but related bug, where trying to revert DROP SCHEMA will hit this and fail:

if details.DescID == descpb.InvalidID {
return errors.AssertionFailedf("job has no database ID or table ID")
}

But that doesn't really have adverse effects besides a confusing error.

@thoszhang thoszhang changed the title sql/catalog: v20.2.4: interface conversion: catalog.MutableDescriptor is *dbdesc.Mutable, not *tabledesc.Mutable sql: panic when reverting jobs to drop databases and schemas Jan 28, 2021
@thoszhang thoszhang changed the title sql: panic when reverting jobs to drop databases and schemas sql: panic when reverting schema change jobs on databases and schemas Jan 28, 2021
@thoszhang thoszhang self-assigned this Feb 2, 2021
@thoszhang
Copy link
Contributor

thoszhang commented Feb 26, 2021

The (sort of) good news is that it seems that this panic no longer happens on master due to our descs.Collection API improvements. It used to be the case that GetMutableTableVersionByID would always type assert into a table without even checking the type:

table := desc.(*tabledesc.Mutable)

But now we get an inscrutable relation [<id>] does not exist internal error.

@ajwerner
Copy link
Contributor

That's right good news that validates the investment in the code.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
C-bug Code not up to spec/doc, specs & docs deemed correct. Solution expected to change code/behavior. O-sentry Originated from an in-the-wild panic report.
Projects
None yet
Development

Successfully merging a pull request may close this issue.

3 participants