From 533e96d3c25ad2abc6af3063435a5d56379685aa Mon Sep 17 00:00:00 2001 From: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> Date: Thu, 30 Jun 2022 12:18:04 +0300 Subject: [PATCH] Online DDL: even more logging (#10615) (#798) * clean error message upon RETRY Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> * auditing change of status, change of message, termination, finding stale migration Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> --- go/vt/vttablet/onlineddl/executor.go | 6 +++++- go/vt/vttablet/onlineddl/schema.go | 2 ++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/go/vt/vttablet/onlineddl/executor.go b/go/vt/vttablet/onlineddl/executor.go index a0bcea86635..e8aecaacfb0 100644 --- a/go/vt/vttablet/onlineddl/executor.go +++ b/go/vt/vttablet/onlineddl/executor.go @@ -1698,6 +1698,7 @@ func (e *Executor) readPendingMigrationsUUIDs(ctx context.Context) (uuids []stri // terminateMigration attempts to interrupt and hard-stop a running migration func (e *Executor) terminateMigration(ctx context.Context, onlineDDL *schema.OnlineDDL) (foundRunning bool, err error) { + log.Infof("terminateMigration: request to terminate %s", onlineDDL.UUID) // It's possible the killing the migration fails for whatever reason, in which case // the logic will retry killing it later on. // Whatever happens in this function, this executor stops owning the given migration. @@ -3313,7 +3314,8 @@ func (e *Executor) reviewStaleMigrations(ctx context.Context) error { if err != nil { return err } - message := fmt.Sprintf("stale migration %s: found running but indicates no liveness", onlineDDL.UUID) + log.Infof("reviewStaleMigrations: stale migration found: %s", onlineDDL.UUID) + message := fmt.Sprintf("stale migration %s: found running but indicates no liveness in the past %v minutes", onlineDDL.UUID, staleMigrationMinutes) if onlineDDL.TabletAlias != e.TabletAliasString() { // This means another tablet started the migration, and the migration has failed due to the tablet failure (e.g. primary failover) if err := e.updateTabletFailure(ctx, onlineDDL.UUID); err != nil { @@ -3635,6 +3637,7 @@ func (e *Executor) updateTabletFailure(ctx context.Context, uuid string) error { } func (e *Executor) updateMigrationStatus(ctx context.Context, uuid string, status schema.OnlineDDLStatus) error { + log.Infof("updateMigrationStatus: transitioning migration: %s into status: %s", uuid, string(status)) query, err := sqlparser.ParseAndBind(sqlUpdateMigrationStatus, sqltypes.StringBindVariable(string(status)), sqltypes.StringBindVariable(uuid), @@ -3659,6 +3662,7 @@ func (e *Executor) updateDDLAction(ctx context.Context, uuid string, actionStr s } func (e *Executor) updateMigrationMessage(ctx context.Context, uuid string, message string) error { + log.Infof("updateMigrationMessage: uuid=%s, message=%s", uuid, message) query, err := sqlparser.ParseAndBind(sqlUpdateMessage, sqltypes.StringBindVariable(message), sqltypes.StringBindVariable(uuid), diff --git a/go/vt/vttablet/onlineddl/schema.go b/go/vt/vttablet/onlineddl/schema.go index e4e15167d45..ac9fff7b93b 100644 --- a/go/vt/vttablet/onlineddl/schema.go +++ b/go/vt/vttablet/onlineddl/schema.go @@ -261,6 +261,7 @@ const ( tablet=%a, retries=retries + 1, tablet_failure=0, + message='', ready_timestamp=NULL, started_timestamp=NULL, liveness_timestamp=NULL, @@ -277,6 +278,7 @@ const ( tablet=%a, retries=retries + 1, tablet_failure=0, + message='', ready_timestamp=NULL, started_timestamp=NULL, liveness_timestamp=NULL,