diff --git a/event.go b/event.go index ef0c1c0ec2..561df6c1b1 100644 --- a/event.go +++ b/event.go @@ -121,6 +121,8 @@ type DiskSlowInfo struct { Path string // Operation being performed on the file. OpType vfs.OpType + // Size of write in kilobytes, if the write is sized. + WriteSize int // Duration that has elapsed since this disk operation started. Duration time.Duration } @@ -131,8 +133,10 @@ func (i DiskSlowInfo) String() string { // SafeFormat implements redact.SafeFormatter. func (i DiskSlowInfo) SafeFormat(w redact.SafePrinter, _ rune) { - w.Printf("disk slowness detected: %s on file %s has been ongoing for %0.1fs", - redact.Safe(i.OpType.String()), i.Path, redact.Safe(i.Duration.Seconds())) + // TODO(before merge): Should we mention the ceiling of ~1.04 GB here? Any reviewers have an + // opinion? + w.Printf("disk slowness detected: %s on file %s (%d kilobytes) has been ongoing for %0.1fs", + redact.Safe(i.OpType.String()), i.Path, i.WriteSize, redact.Safe(i.Duration.Seconds())) } // FlushInfo contains the info for a flush event. diff --git a/options.go b/options.go index 482018fb97..2f7d9d86f6 100644 --- a/options.go +++ b/options.go @@ -966,11 +966,12 @@ func (o *Options) EnsureDefaults() *Options { if o.FS == nil { o.FS, o.private.fsCloser = vfs.WithDiskHealthChecks(vfs.Default, 5*time.Second, - func(name string, op vfs.OpType, duration time.Duration) { + func(name string, op vfs.OpType, writeSize int, duration time.Duration) { o.EventListener.DiskSlow(DiskSlowInfo{ - Path: name, - OpType: op, - Duration: duration, + Path: name, + OpType: op, + WriteSize: writeSize, + Duration: duration, }) }) } diff --git a/vfs/disk_health.go b/vfs/disk_health.go index dfa5a40c6f..bb43ac9330 100644 --- a/vfs/disk_health.go +++ b/vfs/disk_health.go @@ -25,9 +25,15 @@ const ( preallocatedSlotCount = 16 // nOffsetBits is the number of bits in the packed 64-bit integer used for // identifying an offset from the file creation time (in nanoseconds). 
- nOffsetBits = 60 + nOffsetBits = 40 + // nWriteSizeBits is the number of bits in the packed 64-bit integer used for + // identifying the size of the write operation, if the operation is sized. + nWriteSizeBits = 20 ) +// Track size of writes at kilobyte precision. See comment above lastWritePacked for more. +var writeSizePrecision = 1000 + // OpType is the type of IO operation being monitored by a // diskHealthCheckingFile. type OpType uint8 @@ -88,21 +94,36 @@ func (o OpType) String() string { type diskHealthCheckingFile struct { File - onSlowDisk func(OpType, time.Duration) + onSlowDisk func(OpType, int, time.Duration) diskSlowThreshold time.Duration tickInterval time.Duration stopper chan struct{} - // lastWritePacked is a 64-bit unsigned int, with the most significant 7.5 - // bytes (60 bits) representing an offset (in nanoseconds) from the file - // creation time. The least significant four bits contains the OpType. + // lastWritePacked is a 64-bit unsigned int. The most significant + // 40 bits represent an offset (in milliseconds) from the creation + // time of the diskHealthCheckingFile. The next most significant 20 bits + // represent the size of the write in KBs, if the write has a size. (If + // it doesn't, the 20 bits are zeroed). The least significant four bits + // contains the OpType. + // + // The use of 40 bits for an offset provides ~34 years of effective + // monitoring time before the uint wraps around, at millisecond precision. + // ~34 years of process uptime "ought to be enough for anybody". Millisecond + // precision is sufficient, given that we are monitoring for writes that take + // longer than one millisecond. + // + // The use of 20 bits for the size in KBs allows representing sizes of up + // to ~1.04 GBs. If the write is larger than that, we round down to ~1.04 GBs. + // - // The use of 60 bits for an offset provides ~36.5 years of effective - // monitoring time before the uint wraps around. 
36.5 years of process uptime - // "ought to be enough for anybody". This allows for 16 operation types. + // The use of four bits for OpType allows for 16 operation types. // // NB: this packing scheme is not persisted, and is therefore safe to adjust // across process boundaries. + // + // TODO(before merge): 1 KB to ~1.04 GB is range of writes that we can represent via + // this scheme, with bigger writes being rounded down to ~1.04 GB. Not sure how big + // writes in pebble are, on both low side, average side, & high side. Need to check + // on that, & adjust scheme if needed. lastWritePacked uint64 createTime time.Time } @@ -110,7 +131,7 @@ type diskHealthCheckingFile struct { // newDiskHealthCheckingFile instantiates a new diskHealthCheckingFile, with the // specified time threshold and event listener. func newDiskHealthCheckingFile( - file File, diskSlowThreshold time.Duration, onSlowDisk func(OpType, time.Duration), + file File, diskSlowThreshold time.Duration, onSlowDisk func(OpType, int, time.Duration), ) *diskHealthCheckingFile { return &diskHealthCheckingFile{ File: file, @@ -138,19 +159,18 @@ func (d *diskHealthCheckingFile) startTicker() { select { case <-d.stopper: return - case <-ticker.C: packed := atomic.LoadUint64(&d.lastWritePacked) if packed == 0 { continue } - offsetNanos, op := int64(packed>>(64-nOffsetBits)), OpType(packed&0xf) - lastWrite := d.createTime.Add(time.Duration(offsetNanos)) + offsetMillis, writeSize, op := unpack(packed) + lastWrite := d.createTime.Add(time.Duration(offsetMillis)) now := time.Now() if lastWrite.Add(d.diskSlowThreshold).Before(now) { // diskSlowThreshold was exceeded. Call the passed-in // listener. - d.onSlowDisk(op, now.Sub(lastWrite)) + d.onSlowDisk(op, writeSize, now.Sub(lastWrite)) } } } @@ -164,7 +184,7 @@ func (d *diskHealthCheckingFile) stopTicker() { // Write implements the io.Writer interface. 
func (d *diskHealthCheckingFile) Write(p []byte) (n int, err error) { - d.timeDiskOp(OpTypeWrite, func() { + d.timeDiskOp(OpTypeWrite, len(p), func() { n, err = d.File.Write(p) }) return n, err @@ -178,7 +198,7 @@ func (d *diskHealthCheckingFile) Close() error { // Sync implements the io.Syncer interface. func (d *diskHealthCheckingFile) Sync() (err error) { - d.timeDiskOp(OpTypeSync, func() { + d.timeDiskOp(OpTypeSync, 0, func() { err = d.File.Sync() }) return err @@ -186,23 +206,16 @@ func (d *diskHealthCheckingFile) Sync() (err error) { // timeDiskOp runs the specified closure and makes its timing visible to the // monitoring goroutine, in case it exceeds one of the slow disk durations. -func (d *diskHealthCheckingFile) timeDiskOp(opType OpType, op func()) { +// opType should always be set. writeSizeInBytes should be set if the write +// operation is sized. If not, it should be set to zero. +func (d *diskHealthCheckingFile) timeDiskOp(opType OpType, writeSizeInBytes int, op func()) { if d == nil { op() return } - offsetNanos := time.Since(d.createTime).Nanoseconds() - // We have no guarantee of clock monotonicity. If we have a small regression - // in the clock, we set offsetNanos to zero, so we can still catch the operation - // if happens to be slow. - if offsetNanos < 0 { - offsetNanos = 0 - } - if offsetNanos > 1< 1< 1<>(64-nOffsetBits)), int(packed>>(64-nOffsetBits-nWriteSizeBits))&((1< 1< 1<