From d1fd1f669c7a82dd126f8cb177e31dae2ef30d6b Mon Sep 17 00:00:00 2001 From: Rahul Aggarwal Date: Fri, 28 Jul 2023 16:30:17 -0400 Subject: [PATCH] storage: Write PreventStartupFile on Node SSTFile Corruption Currently, if a node faces sstable corruption, that node will crash and try to automatically restart. Since the node is likely to crash again, we would like to prevent the node from attempting to restart itself. As a result, this PR creates a `PreventStartupFile` when a node experiences sstable corruption. Fixes: #103899 Release-note: None --- pkg/storage/pebble.go | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/pkg/storage/pebble.go b/pkg/storage/pebble.go index ae3a8ad449ef..abb89067b6d5 100644 --- a/pkg/storage/pebble.go +++ b/pkg/storage/pebble.go @@ -1268,10 +1268,33 @@ func (p *Pebble) async(fn func()) { }() } +// writePreventStartupFile creates a file that will prevent nodes from automatically restarting after +// experiencing sstable corruption. +func (p *Pebble) writePreventStartupFile(ctx context.Context) { + auxDir := p.GetAuxiliaryDir() + _ = p.MkdirAll(auxDir, os.ModePerm) + path := base.PreventedStartupFile(auxDir) + + preventStartupMsg := fmt.Sprintf(`ATTENTION: + + this node is terminating because of sstable corruption. + Please contact the CockroachDB support team. It is not necessarily safe + to replace this node; cluster data may still be at risk of corruption. + + A file preventing this node from restarting was placed at: + %s + `, path) + + if err := fs.WriteFile(p.unencryptedFS, path, []byte(preventStartupMsg)); err != nil { + log.Warningf(ctx, "%v", err) + } +} + func (p *Pebble) makeMetricEtcEventListener(ctx context.Context) pebble.EventListener { return pebble.EventListener{ BackgroundError: func(err error) { if errors.Is(err, pebble.ErrCorruption) { + p.writePreventStartupFile(ctx) log.Fatalf(ctx, "local corruption detected: %v", err) } },