diff --git a/br/pkg/lightning/common/retry.go b/br/pkg/lightning/common/retry.go index 19afcd2f90c46..e08fde0c02984 100644 --- a/br/pkg/lightning/common/retry.go +++ b/br/pkg/lightning/common/retry.go @@ -79,6 +79,8 @@ var retryableErrorIDs = map[errors.ErrorID]struct{}{ ErrKVReadIndexNotReady.ID(): {}, ErrKVIngestFailed.ID(): {}, ErrKVRaftProposalDropped.ID(): {}, + // litBackendCtxMgr.Register may return the error. + ErrCreatePDClient.ID(): {}, // during checksum coprocessor will transform error into driver error in handleCopResponse using ToTiDBErr // met ErrRegionUnavailable on free-tier import during checksum, others hasn't met yet drivererr.ErrRegionUnavailable.ID(): {}, diff --git a/pkg/ddl/index.go b/pkg/ddl/index.go index f8b5dca53d97f..aff05bb2ca49d 100644 --- a/pkg/ddl/index.go +++ b/pkg/ddl/index.go @@ -1081,9 +1081,7 @@ func runReorgJobAndHandleErr(w *worker, d *ddlCtx, t *meta.Meta, job *model.Job, // TODO(tangenta): get duplicate column and match index. err = convertToKeyExistsErr(err, allIndexInfos[0], tbl.Meta()) } - if !errorIsRetryable(err, job) || - // TODO: Remove this check make it can be retry. Related test is TestModifyColumnReorgInfo. 
- job.ReorgMeta.IsDistReorg { + if !errorIsRetryable(err, job) { logutil.BgLogger().Warn("run add index job failed, convert job to rollback", zap.String("category", "ddl"), zap.String("job", job.String()), zap.Error(err)) ver, err = convertAddIdxJob2RollbackJob(d, t, job, tbl.Meta(), allIndexInfos, err) if err1 := rh.RemoveDDLReorgHandle(job, reorgInfo.elements); err1 != nil { diff --git a/pkg/disttask/framework/scheduler/manager.go b/pkg/disttask/framework/scheduler/manager.go index 95c9aa99c9bba..759e70612ab25 100644 --- a/pkg/disttask/framework/scheduler/manager.go +++ b/pkg/disttask/framework/scheduler/manager.go @@ -22,6 +22,7 @@ import ( "github.com/pingcap/errors" "github.com/pingcap/failpoint" + "github.com/pingcap/tidb/br/pkg/lightning/common" "github.com/pingcap/tidb/pkg/config" "github.com/pingcap/tidb/pkg/disttask/framework/proto" "github.com/pingcap/tidb/pkg/domain/infosync" @@ -416,13 +417,18 @@ func (m *Manager) removeHandlingTask(id int64) { } func (m *Manager) logErr(err error) { - logutil.Logger(m.logCtx).Error("task manager error", zap.Error(err), zap.Stack("stack")) + logutil.Logger(m.logCtx).Error("task manager met error", zap.Error(err), zap.Stack("stack")) } func (m *Manager) logErrAndPersist(err error, taskID int64) { m.logErr(err) + // TODO: use an interface so that each business can decide whether an error is retryable. + if common.IsRetryableError(err) || isRetryableError(err) { + return + } err1 := m.taskTable.UpdateErrorToSubtask(m.id, taskID, err) if err1 != nil { logutil.Logger(m.logCtx).Error("update to subtask failed", zap.Error(err1), zap.Stack("stack")) } + logutil.Logger(m.logCtx).Error("update error to subtask", zap.Int64("task-id", taskID), zap.Error(err1), zap.Stack("stack")) } diff --git a/pkg/disttask/framework/scheduler/scheduler.go b/pkg/disttask/framework/scheduler/scheduler.go index 6253a2c6cab8b..357c39433a3d5 100644 --- a/pkg/disttask/framework/scheduler/scheduler.go +++ b/pkg/disttask/framework/scheduler/scheduler.go @@ -474,13 +474,13 @@ func (s 
*BaseScheduler) onError(err error) { return } err = errors.Trace(err) - logutil.Logger(s.logCtx).Error("onError", zap.Error(err)) + logutil.Logger(s.logCtx).Error("onError", zap.Error(err), zap.Stack("stack")) s.mu.Lock() defer s.mu.Unlock() if s.mu.err == nil { s.mu.err = err - logutil.Logger(s.logCtx).Error("scheduler error", zap.Error(err)) + logutil.Logger(s.logCtx).Error("scheduler met first error", zap.Error(err)) } if s.mu.runtimeCancel != nil { @@ -620,5 +620,8 @@ func (s *BaseScheduler) updateErrorToSubtask(ctx context.Context, taskID int64, return true, s.taskTable.UpdateErrorToSubtask(s.id, taskID, err) }, ) + if err1 == nil { + logger.Warn("update error to subtask success", zap.Error(err)) + } return err1 }