From b2c07b6852a8ef560993babde70f5fd670bce2d1 Mon Sep 17 00:00:00 2001 From: Jackson Owens Date: Tue, 25 Apr 2023 15:51:16 +0000 Subject: [PATCH] storage: fatal on corruption encountered in background Previously, on-disk corruption would only fatal the node if an iterator observed it. Corruption encountered by a background job like a compaction would not fatal the node. This can result in busy churning through compactions that repeatedly fail, impacting cluster stability and user query latencies. Now, on-disk corruption results in immediately exiting the node. Epic: none Fixes: #101101 Release note (ops change): When local corruption of data is encountered by a background job, a node will now exit immediately. --- pkg/storage/pebble.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pkg/storage/pebble.go b/pkg/storage/pebble.go index 8170081bc4a6..9b03e8c750c1 100644 --- a/pkg/storage/pebble.go +++ b/pkg/storage/pebble.go @@ -1192,6 +1192,11 @@ func (p *Pebble) async(fn func()) { func (p *Pebble) makeMetricEtcEventListener(ctx context.Context) pebble.EventListener { return pebble.EventListener{ + BackgroundError: func(err error) { + if errors.Is(err, pebble.ErrCorruption) { + log.Fatalf(ctx, "local corruption detected: %v", err) + } + }, WriteStallBegin: func(info pebble.WriteStallBeginInfo) { atomic.AddInt64(&p.writeStallCount, 1) startNanos := timeutil.Now().UnixNano()