From c5434bf7117cfd4d0d83e1b6717763a90cd2f8ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miloslav=20Trma=C4=8D?= Date: Tue, 20 Feb 2024 19:58:55 +0100 Subject: [PATCH 1/2] Expand sparseWriter tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add more test cases - Test that we create the expected (large) holes; don't enforce anything for the --- .../compression/sparse_file_writer_test.go | 56 ++++++++++++------- 1 file changed, 37 insertions(+), 19 deletions(-) diff --git a/pkg/machine/compression/sparse_file_writer_test.go b/pkg/machine/compression/sparse_file_writer_test.go index 70a7c75b61..c14291ac34 100644 --- a/pkg/machine/compression/sparse_file_writer_test.go +++ b/pkg/machine/compression/sparse_file_writer_test.go @@ -3,22 +3,32 @@ package compression import ( "bytes" "errors" + "fmt" "io" "testing" + + "github.com/sirupsen/logrus" + "github.com/stretchr/testify/assert" ) type memorySparseFile struct { buffer bytes.Buffer pos int64 + sparse int64 } func (m *memorySparseFile) Seek(offset int64, whence int) (int64, error) { + logrus.Debugf("Seek %d %d", offset, whence) var newPos int64 switch whence { case io.SeekStart: - newPos = offset + panic("unexpected") case io.SeekCurrent: newPos = m.pos + offset + if offset < -1 { + panic("unexpected") + } + m.sparse += offset case io.SeekEnd: newPos = int64(m.buffer.Len()) + offset default: @@ -34,6 +44,7 @@ func (m *memorySparseFile) Seek(offset int64, whence int) (int64, error) { } func (m *memorySparseFile) Write(b []byte) (n int, err error) { + logrus.Debugf("Write %d", len(b)) if int64(m.buffer.Len()) < m.pos { padding := make([]byte, m.pos-int64(m.buffer.Len())) _, err := m.buffer.Write(padding) @@ -42,8 +53,6 @@ func (m *memorySparseFile) Write(b []byte) (n int, err error) { } } - m.buffer.Next(int(m.pos) - m.buffer.Len()) - n, err = m.buffer.Write(b) m.pos += int64(n) return n, err @@ -53,7 +62,7 @@ func (m *memorySparseFile) Close() error { return nil } -func testInputWithWriteLen(t *testing.T, input []byte, chunkSize int) { +func testInputWithWriteLen(t *testing.T, input []byte, minSparse int64, chunkSize int) { m := &memorySparseFile{} sparseWriter := NewSparseWriter(m) @@ -71,15 +80,16 @@ func testInputWithWriteLen(t *testing.T, input []byte, chunkSize int) { if err != nil { t.Fatalf("Expected no error, got %v", err) } - if !bytes.Equal(input, m.buffer.Bytes()) { - t.Fatalf("Incorrect output") - } + assert.Equal(t, string(input), m.buffer.String()) + assert.GreaterOrEqual(t, m.sparse, minSparse) } -func testInput(t *testing.T, inputBytes []byte) { +func testInput(t *testing.T, name string, inputBytes []byte, minSparse int64) { currentLen := 1 for { - testInputWithWriteLen(t, inputBytes, currentLen) + t.Run(fmt.Sprintf("%s@%d", name, currentLen), func(t *testing.T) { + testInputWithWriteLen(t, inputBytes, minSparse, currentLen) + }) currentLen <<= 1 if currentLen > len(inputBytes) { break @@ -88,22 +98,30 @@ func testInput(t *testing.T, inputBytes []byte) { } func TestSparseWriter(t *testing.T) { - testInput(t, []byte("hello")) - testInput(t, append(make([]byte, 100), []byte("hello")...)) - testInput(t, []byte("")) + testInput(t, "small contents", []byte("hello"), 0) + testInput(t, "small zeroes", append(make([]byte, 100), []byte("hello")...), 0) + testInput(t, "empty", []byte(""), 0) + testInput(t, "small iterated", []byte{'a', 0, 'a', 0, 'a', 0}, 0) + testInput(t, "small iterated2", []byte{0, 'a', 0, 'a', 0, 'a'}, 0) // add "hello" at the beginning - largeInput := make([]byte, 1024*1024) + const largeSize = 1024 * 1024 + largeInput := make([]byte, largeSize) copy(largeInput, []byte("hello")) - testInput(t, largeInput) + testInput(t, "sparse end", largeInput, largeSize-5-1) // -1 for the final byte establishing file size // add "hello" at the end - largeInput = make([]byte, 1024*1024) - copy(largeInput[1024*1024-5:], []byte("hello")) - testInput(t, largeInput) + largeInput = make([]byte, largeSize) + copy(largeInput[largeSize-5:], []byte("hello")) + testInput(t, "sparse beginning", largeInput, largeSize-5) // add "hello" in the middle - largeInput = make([]byte, 1024*1024) + largeInput = make([]byte, largeSize) copy(largeInput[len(largeInput)/2:], []byte("hello")) - testInput(t, largeInput) + testInput(t, "sparse both ends", largeInput, largeSize-5-1) // -1 for the final byte establishing file size + + largeInput = make([]byte, largeSize) + copy(largeInput[0:5], []byte("hello")) + copy(largeInput[largeSize-5:], []byte("HELLO")) + testInput(t, "sparse middle", largeInput, largeSize-10) } From 5d303ca267fce98be6f2299fbbb50334d7bda159 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miloslav=20Trma=C4=8D?= Date: Tue, 20 Feb 2024 20:16:53 +0100 Subject: [PATCH 2/2] Reformulate sparseWriter to deal with starting/ending zeroes explicitly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ... instead of using a multi-variable state machine. The net effect of this code is exactly the same as the previous implementation, except: - the operation after Write() returns an error might differ - If the file ends with zeroes, we don't Seek(-1), and we don't create a hole at all if it is too small, preferring to save a syscall. But this formulation is hopefully easier to prove correct. Signed-off-by: Miloslav Trmač --- pkg/machine/compression/sparse_file_writer.go | 177 +++++++++--------- 1 file changed, 91 insertions(+), 86 deletions(-) diff --git a/pkg/machine/compression/sparse_file_writer.go b/pkg/machine/compression/sparse_file_writer.go index d8e071a3ca..c79941587e 100644 --- a/pkg/machine/compression/sparse_file_writer.go +++ b/pkg/machine/compression/sparse_file_writer.go @@ -1,19 +1,12 @@ package compression import ( - "bytes" "errors" + "fmt" "io" ) -type state int - -const ( - zerosThreshold = 1024 - - stateData = iota - stateZeros -) +const zerosThreshold = 1024 type WriteSeekCloser interface { io.Closer @@ -21,91 +14,103 @@ type WriteSeekCloser interface { } type sparseWriter struct { - state state - file WriteSeekCloser - zeros int64 - lastIsZero bool + file WriteSeekCloser + // Invariant between method calls: + // The contents of the file match the contents passed to Write, except that pendingZeroes trailing zeroes have not been written. + // Also, the data that _has_ been written does not end with a zero byte (i.e. pendingZeroes is the largest possible value. + pendingZeroes int64 } func NewSparseWriter(file WriteSeekCloser) *sparseWriter { return &sparseWriter{ - file: file, - state: stateData, - zeros: 0, - lastIsZero: false, + file: file, + pendingZeroes: 0, } } -func (sw *sparseWriter) createHole() error { - zeros := sw.zeros - if zeros == 0 { - return nil - } - sw.zeros = 0 - sw.lastIsZero = true - _, err := sw.file.Seek(zeros, io.SeekCurrent) +func (sw *sparseWriter) createHole(size int64) error { + _, err := sw.file.Seek(size, io.SeekCurrent) return err } -func findFirstNotZero(b []byte) int { - for i, v := range b { - if v != 0 { - return i - } +func zeroSpanEnd(b []byte, i int) int { + for i < len(b) && b[i] == 0 { + i++ + } + return i +} + +func nonzeroSpanEnd(b []byte, i int) int { + for i < len(b) && b[i] != 0 { + i++ } - return -1 + return i } // Write writes data to the file, creating holes for long sequences of zeros. func (sw *sparseWriter) Write(data []byte) (int, error) { - written, current := 0, 0 - totalLen := len(data) - for current < len(data) { - switch sw.state { - case stateData: - nextZero := bytes.IndexByte(data[current:], 0) - if nextZero < 0 { - _, err := sw.file.Write(data[written:]) - sw.lastIsZero = false - return totalLen, err - } else { - current += nextZero - sw.state = stateZeros - } - case stateZeros: - nextNonZero := findFirstNotZero(data[current:]) - if nextNonZero < 0 { - // finish with a zero, flush any data and keep track of the zeros - if written != current { - if _, err := sw.file.Write(data[written:current]); err != nil { - return -1, err - } - sw.lastIsZero = false - } - sw.zeros += int64(len(data) - current) - return totalLen, nil - } - // do not bother with too short sequences - if sw.zeros == 0 && nextNonZero < zerosThreshold { - sw.state = stateData - current += nextNonZero - continue - } - if written != current { - if _, err := sw.file.Write(data[written:current]); err != nil { - return -1, err - } - sw.lastIsZero = false - } - sw.zeros += int64(nextNonZero) - current += nextNonZero - if err := sw.createHole(); err != nil { - return -1, err - } - written = current + initialZeroSpanLength := zeroSpanEnd(data, 0) + if initialZeroSpanLength == len(data) { + sw.pendingZeroes += int64(initialZeroSpanLength) + return initialZeroSpanLength, nil + } + + // We have _some_ non-zero data to write. + // Think of the input as an alternating sequence of spans of zeroes / non-zeroes 0a0b…c0, + // where the starting/ending span of zeroes may be empty. + + pendingWriteOffset := 0 + // The expected condition for creating a hole would be sw.pendingZeroes + initialZeroSpanLength >= zerosThreshold; but + // if sw.pendingZeroes != 0, we are going to spend a syscall to deal with sw.pendingZeroes either way. + // We might just as well make it a createHole(), even if the hole size is below zeroThreshold. + if sw.pendingZeroes != 0 || initialZeroSpanLength >= zerosThreshold { + if err := sw.createHole(sw.pendingZeroes + int64(initialZeroSpanLength)); err != nil { + return -1, err } + // We could set sw.pendingZeroes = 0 now; it would always be overwritten on successful return from this function. + pendingWriteOffset = initialZeroSpanLength + } + + current := initialZeroSpanLength + for { + // Invariant at this point of this loop: + // - pendingWriteOffset <= current < len(data) + // - data[current] != 0 + // - data[pendingWriteOffset:current] has not yet been written + if pendingWriteOffset > current || current >= len(data) { + return -1, fmt.Errorf("internal error: sparseWriter invariant violation: %d <= %d < %d", pendingWriteOffset, current, len(data)) + } + if b := data[current]; b == 0 { + return -1, fmt.Errorf("internal error: sparseWriter invariant violation: %d@%d", b, current) + } + + nonzeroSpanEnd := nonzeroSpanEnd(data, current) + if nonzeroSpanEnd == current { + return -1, fmt.Errorf("internal error: sparseWriter’s nonzeroSpanEnd didn’t advance") + } + zeroSpanEnd := zeroSpanEnd(data, nonzeroSpanEnd) // possibly == nonzeroSpanEnd + zeroSpanLength := zeroSpanEnd - nonzeroSpanEnd + if zeroSpanEnd < len(data) && zeroSpanLength < zerosThreshold { + // Too small a hole, keep going + current = zeroSpanEnd + continue + } + + // We have either reached the end, or found an interesting hole. Issue a write. + if _, err := sw.file.Write(data[pendingWriteOffset:nonzeroSpanEnd]); err != nil { + return -1, err + } + if zeroSpanEnd == len(data) { + sw.pendingZeroes = int64(zeroSpanLength) + return zeroSpanEnd, nil + } + + if err := sw.createHole(int64(zeroSpanLength)); err != nil { + return -1, err + } + pendingWriteOffset = zeroSpanEnd + current = zeroSpanEnd } - return totalLen, nil } // Close closes the SparseWriter's underlying file. @@ -113,16 +118,16 @@ func (sw *sparseWriter) Close() error { if sw.file == nil { return errors.New("file is already closed") } - if err := sw.createHole(); err != nil { - sw.file.Close() - return err - } - if sw.lastIsZero { - if _, err := sw.file.Seek(-1, io.SeekCurrent); err != nil { - sw.file.Close() - return err + if sw.pendingZeroes != 0 { + if holeSize := sw.pendingZeroes - 1; holeSize >= zerosThreshold { + if err := sw.createHole(holeSize); err != nil { + sw.file.Close() + return err + } + sw.pendingZeroes -= holeSize } - if _, err := sw.file.Write([]byte{0}); err != nil { + var zeroArray [zerosThreshold]byte + if _, err := sw.file.Write(zeroArray[:sw.pendingZeroes]); err != nil { sw.file.Close() return err }