diff --git a/CHANGELOG.md b/CHANGELOG.md index 9f1f8f56bf..5ea1e300c3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,9 @@ Changelog for NeoFS Node ### Fixed +### Changed +- FSTree storage now uses more efficient and safe temporary files under Linux (#2566) + ### Removed ### Updated diff --git a/pkg/local_object_storage/blobstor/bench_test.go b/pkg/local_object_storage/blobstor/bench_test.go new file mode 100644 index 0000000000..24cad0300c --- /dev/null +++ b/pkg/local_object_storage/blobstor/bench_test.go @@ -0,0 +1,139 @@ +package blobstor_test + +import ( + "crypto/rand" + "fmt" + "path/filepath" + "sync" + "testing" + "time" + + bbczt "github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor/blobovniczatree" + "github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor/common" + "github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor/fstree" + "github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor/peapod" + oidtest "github.com/nspcc-dev/neofs-sdk-go/object/id/test" + "github.com/stretchr/testify/require" +) + +func benchmarkPutMN(b *testing.B, depth, width uint64, parallel bool) { + nBlobovniczas := uint64(1) + for i := uint64(1); i <= depth+1; i++ { + nBlobovniczas *= width + } + + const objSizeLimit = 4 << 10 + const fullSizeLimit = 100 << 20 + + bbcz := bbczt.NewBlobovniczaTree( + bbczt.WithRootPath(b.TempDir()), + bbczt.WithObjectSizeLimit(objSizeLimit), + bbczt.WithBlobovniczaSize(fullSizeLimit/nBlobovniczas), + bbczt.WithBlobovniczaShallowWidth(width), + bbczt.WithBlobovniczaShallowDepth(depth), + ) + + require.NoError(b, bbcz.Open(false)) + b.Cleanup(func() { _ = bbcz.Close() }) + require.NoError(b, bbcz.Init()) + + benchmark(b, bbcz, objSizeLimit, 20) +} + +func BenchmarkBlobovniczas_Put(b *testing.B) { + for _, testCase := range []struct { + width, depth uint64 + }{ + {1, 0}, + {10, 0}, + {2, 2}, + {4, 4}, + } { + b.Run(fmt.Sprintf("tree=%dx%d", testCase.width, testCase.depth), func(b *testing.B) { + benchmarkPutMN(b, testCase.depth, testCase.width, false) + }) + b.Run(fmt.Sprintf("tree=%dx%d_parallel", testCase.width, testCase.depth), func(b *testing.B) { + benchmarkPutMN(b, testCase.depth, testCase.width, true) + }) + } +} + +func testPeapodPath(tb testing.TB) string { + return filepath.Join(tb.TempDir(), "peapod.db") +} + +func newTestPeapod(tb testing.TB) common.Storage { + return peapod.New(testPeapodPath(tb), 0600, 10*time.Millisecond) +} + +func newTestFSTree(tb testing.TB) common.Storage { + return fstree.New( + fstree.WithDepth(4), // Default. + fstree.WithPath(tb.TempDir()), + fstree.WithDirNameLen(1), // Default. + fstree.WithNoSync(false), // Default. + ) +} + +func benchmark(b *testing.B, p common.Storage, objSize uint64, nThreads int) { + data := make([]byte, objSize) + rand.Read(data) + + prm := common.PutPrm{ + RawData: data, + } + + b.ResetTimer() + + for i := 0; i < b.N; i++ { + var wg sync.WaitGroup + + for i := 0; i < nThreads; i++ { + wg.Add(1) + go func() { + defer wg.Done() + + prm := prm + prm.Address = oidtest.Address() + + _, err := p.Put(prm) + require.NoError(b, err) + }() + } + + wg.Wait() + } +} + +func BenchmarkPut(b *testing.B) { + for _, tc := range []struct { + objSize uint64 + nThreads int + }{ + {1, 1}, + {1, 20}, + {1, 100}, + {1 << 10, 1}, + {1 << 10, 20}, + {1 << 10, 100}, + {100 << 10, 1}, + {100 << 10, 20}, + {100 << 10, 100}, + } { + b.Run(fmt.Sprintf("size=%d,thread=%d", tc.objSize, tc.nThreads), func(b *testing.B) { + for name, creat := range map[string]func(testing.TB) common.Storage{ + "peapod": newTestPeapod, + "fstree": newTestFSTree, + } { + b.Run(name, func(b *testing.B) { + ptt := creat(b) + require.NoError(b, ptt.Open(false)) + require.NoError(b, ptt.Init()) + b.Cleanup(func() { _ = ptt.Close() }) + + benchmark(b, ptt, tc.objSize, tc.nThreads) + }) + } + }) + } +} diff --git a/pkg/local_object_storage/blobstor/blobovniczatree/put_test.go b/pkg/local_object_storage/blobstor/blobovniczatree/put_test.go index a5eea60c7f..16769dc09f 100644 --- a/pkg/local_object_storage/blobstor/blobovniczatree/put_test.go +++ b/pkg/local_object_storage/blobstor/blobovniczatree/put_test.go @@ -1,10 +1,6 @@ package blobovniczatree_test import ( - "crypto/rand" - "errors" - "fmt" - "sync" "testing" . "github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor/blobovniczatree" @@ -42,84 +38,3 @@ func TestSingleDir(t *testing.T) { }) require.NoError(t, err) } - -func benchmarkPutMN(b *testing.B, depth, width uint64, parallel bool) { - nBlobovniczas := uint64(1) - for i := uint64(1); i <= depth+1; i++ { - nBlobovniczas *= width - } - - const objSizeLimit = 4 << 10 - const fullSizeLimit = 100 << 20 - - bbcz := NewBlobovniczaTree( - WithRootPath(b.TempDir()), - WithObjectSizeLimit(objSizeLimit), - WithBlobovniczaSize(fullSizeLimit/nBlobovniczas), - WithBlobovniczaShallowWidth(width), - WithBlobovniczaShallowDepth(depth), - ) - - require.NoError(b, bbcz.Open(false)) - b.Cleanup(func() { _ = bbcz.Close() }) - require.NoError(b, bbcz.Init()) - - prm := common.PutPrm{ - RawData: make([]byte, objSizeLimit), - } - - rand.Read(prm.RawData) - - var wg sync.WaitGroup - - f := func(prm common.PutPrm) { - defer wg.Done() - - var err error - - for i := 0; i < b.N; i++ { - prm.Address = oidtest.Address() - - _, err = bbcz.Put(prm) - if err != nil { - if errors.Is(err, common.ErrNoSpace) { - break - } - require.NoError(b, err) - } - } - } - - nRoutine := 1 - if parallel { - nRoutine = 20 - } - - b.ReportAllocs() - b.ResetTimer() - - for j := 0; j < nRoutine; j++ { - wg.Add(1) - go f(prm) - } - - wg.Wait() -} - -func BenchmarkBlobovniczas_Put(b *testing.B) { - for _, testCase := range []struct { - width, depth uint64 - }{ - {1, 0}, - {10, 0}, - {2, 2}, - {4, 4}, - } { - b.Run(fmt.Sprintf("tree=%dx%d", testCase.width, testCase.depth), func(b *testing.B) { - benchmarkPutMN(b, testCase.depth, testCase.width, false) - }) - b.Run(fmt.Sprintf("tree=%dx%d_parallel", testCase.width, testCase.depth), func(b *testing.B) { - benchmarkPutMN(b, testCase.depth, testCase.width, true) - }) - } -} diff --git a/pkg/local_object_storage/blobstor/fstree/fstree.go b/pkg/local_object_storage/blobstor/fstree/fstree.go index 3d254443ea..911407f81f 100644 --- a/pkg/local_object_storage/blobstor/fstree/fstree.go +++ b/pkg/local_object_storage/blobstor/fstree/fstree.go @@ -7,9 +7,7 @@ import ( "io/fs" "os" "path/filepath" - "strconv" "strings" - "syscall" "github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor/common" "github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor/compression" @@ -247,103 +245,7 @@ func (t *FSTree) Put(prm common.PutPrm) (common.PutRes, error) { if !prm.DontCompress { prm.RawData = t.Compress(prm.RawData) } - - // Here is a situation: - // Feb 09 13:10:37 buky neofs-node[32445]: 2023-02-09T13:10:37.161Z info log/log.go:13 local object storage operation {"shard_id": "SkT8BfjouW6t93oLuzQ79s", "address": "7NxFz4SruSi8TqXacr2Ae22nekMhgYk1sfkddJo9PpWk/5enyUJGCyU1sfrURDnHEjZFdbGqANVhayYGfdSqtA6wA", "op": "PUT", "type": "fstree", "storage_id": ""} - // Feb 09 13:10:37 buky neofs-node[32445]: 2023-02-09T13:10:37.183Z info log/log.go:13 local object storage operation {"shard_id": "SkT8BfjouW6t93oLuzQ79s", "address": "7NxFz4SruSi8TqXacr2Ae22nekMhgYk1sfkddJo9PpWk/5enyUJGCyU1sfrURDnHEjZFdbGqANVhayYGfdSqtA6wA", "op": "metabase PUT"} - // Feb 09 13:10:37 buky neofs-node[32445]: 2023-02-09T13:10:37.862Z debug policer/check.go:231 shortage of object copies detected {"component": "Object Policer", "object": "7NxFz4SruSi8TqXacr2Ae22nekMhgYk1sfkddJo9PpWk/5enyUJGCyU1sfrURDnHEjZFdbGqANVhayYGfdSqtA6wA", "shortage": 1} - // Feb 09 13:10:37 buky neofs-node[32445]: 2023-02-09T13:10:37.862Z debug shard/get.go:124 object is missing in write-cache {"shard_id": "SkT8BfjouW6t93oLuzQ79s", "addr": "7NxFz4SruSi8TqXacr2Ae22nekMhgYk1sfkddJo9PpWk/5enyUJGCyU1sfrURDnHEjZFdbGqANVhayYGfdSqtA6wA", "skip_meta": false} - // - // 1. We put an object on node 1. - // 2. Relentless policer sees that it has only 1 copy and tries to PUT it to node 2. - // 3. PUT operation started by client at (1) also puts an object here. - // 4. Now we have concurrent writes and one of `Rename` calls will return `no such file` error. - // Even more than that, concurrent writes can corrupt data. - // - // So here is a solution: - // 1. Write a file to 'name + 1'. - // 2. If it exists, retry with temporary name being 'name + 2'. - // 3. Set some reasonable number of attempts. - // - // It is a bit kludgey, but I am unusually proud about having found this out after - // hours of research on linux kernel, dirsync mount option and ext4 FS, turned out - // to be so hecking simple. - // In a very rare situation we can have multiple partially written copies on disk, - // this will be fixed in another issue (we should remove garbage on start). - const retryCount = 5 - for i := 0; i < retryCount; i++ { - tmpPath := p + "#" + strconv.FormatUint(uint64(i), 10) - err := t.writeAndRename(tmpPath, p, prm.RawData) - if err != syscall.EEXIST || i == retryCount-1 { - return common.PutRes{StorageID: []byte{}}, err - } - } - - // unreachable, but precaution never hurts, especially 1 day before release. - return common.PutRes{StorageID: []byte{}}, fmt.Errorf("couldn't read file after %d retries", retryCount) -} - -// writeAndRename opens tmpPath exclusively, writes data to it and renames it to p. -func (t *FSTree) writeAndRename(tmpPath, p string, data []byte) error { - err := t.writeFile(tmpPath, data) - if err != nil { - var pe *fs.PathError - if errors.As(err, &pe) { - switch pe.Err { - case syscall.ENOSPC: - err = common.ErrNoSpace - _ = os.RemoveAll(tmpPath) - case syscall.EEXIST: - return syscall.EEXIST - } - } - } else { - err = os.Rename(tmpPath, p) - } - return err -} - -func (t *FSTree) writeFlags() int { - flags := os.O_WRONLY | os.O_CREATE | os.O_TRUNC | os.O_EXCL - if t.noSync { - return flags - } - return flags | os.O_SYNC -} - -// writeFile writes data to a file with path p. -// The code is copied from `os.WriteFile` with minor corrections for flags. -func (t *FSTree) writeFile(p string, data []byte) error { - f, err := os.OpenFile(p, t.writeFlags(), t.Permissions) - if err != nil { - return err - } - _, err = f.Write(data) - if err1 := f.Close(); err1 != nil && err == nil { - err = err1 - } - return err -} - -// PutStream puts executes handler on a file opened for write. -func (t *FSTree) PutStream(addr oid.Address, handler func(*os.File) error) error { - if t.readOnly { - return common.ErrReadOnly - } - - p := t.treePath(addr) - - if err := util.MkdirAllX(filepath.Dir(p), t.Permissions); err != nil { - return err - } - - f, err := os.OpenFile(p, t.writeFlags(), t.Permissions) - if err != nil { - return err - } - defer f.Close() - - return handler(f) + return common.PutRes{StorageID: []byte{}}, t.writeData(p, prm.RawData) } // Get returns an object from the storage by address. diff --git a/pkg/local_object_storage/blobstor/fstree/fstree_write_generic.go b/pkg/local_object_storage/blobstor/fstree/fstree_write_generic.go new file mode 100644 index 0000000000..4e891be68d --- /dev/null +++ b/pkg/local_object_storage/blobstor/fstree/fstree_write_generic.go @@ -0,0 +1,92 @@ +//go:build !linux + +package fstree + +import ( + "errors" + "fmt" + "io/fs" + "os" + "strconv" + "syscall" + + "github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor/common" +) + +func (t *FSTree) writeData(p string, data []byte) error { + // Here is a situation: + // Feb 09 13:10:37 buky neofs-node[32445]: 2023-02-09T13:10:37.161Z info log/log.go:13 local object storage operation {"shard_id": "SkT8BfjouW6t93oLuzQ79s", "address": "7NxFz4SruSi8TqXacr2Ae22nekMhgYk1sfkddJo9PpWk/5enyUJGCyU1sfrURDnHEjZFdbGqANVhayYGfdSqtA6wA", "op": "PUT", "type": "fstree", "storage_id": ""} + // Feb 09 13:10:37 buky neofs-node[32445]: 2023-02-09T13:10:37.183Z info log/log.go:13 local object storage operation {"shard_id": "SkT8BfjouW6t93oLuzQ79s", "address": "7NxFz4SruSi8TqXacr2Ae22nekMhgYk1sfkddJo9PpWk/5enyUJGCyU1sfrURDnHEjZFdbGqANVhayYGfdSqtA6wA", "op": "metabase PUT"} + // Feb 09 13:10:37 buky neofs-node[32445]: 2023-02-09T13:10:37.862Z debug policer/check.go:231 shortage of object copies detected {"component": "Object Policer", "object": "7NxFz4SruSi8TqXacr2Ae22nekMhgYk1sfkddJo9PpWk/5enyUJGCyU1sfrURDnHEjZFdbGqANVhayYGfdSqtA6wA", "shortage": 1} + // Feb 09 13:10:37 buky neofs-node[32445]: 2023-02-09T13:10:37.862Z debug shard/get.go:124 object is missing in write-cache {"shard_id": "SkT8BfjouW6t93oLuzQ79s", "addr": "7NxFz4SruSi8TqXacr2Ae22nekMhgYk1sfkddJo9PpWk/5enyUJGCyU1sfrURDnHEjZFdbGqANVhayYGfdSqtA6wA", "skip_meta": false} + // + // 1. We put an object on node 1. + // 2. Relentless policer sees that it has only 1 copy and tries to PUT it to node 2. + // 3. PUT operation started by client at (1) also puts an object here. + // 4. Now we have concurrent writes and one of `Rename` calls will return `no such file` error. + // Even more than that, concurrent writes can corrupt data. + // + // So here is a solution: + // 1. Write a file to 'name + 1'. + // 2. If it exists, retry with temporary name being 'name + 2'. + // 3. Set some reasonable number of attempts. + // + // It is a bit kludgey, but I am unusually proud about having found this out after + // hours of research on linux kernel, dirsync mount option and ext4 FS, turned out + // to be so hecking simple. + // In a very rare situation we can have multiple partially written copies on disk, + // this will be fixed in another issue (we should remove garbage on start). + const retryCount = 5 + for i := 0; i < retryCount; i++ { + tmpPath := p + "#" + strconv.FormatUint(uint64(i), 10) + err := t.writeAndRename(tmpPath, p, data) + if err != syscall.EEXIST || i == retryCount-1 { + return err + } + } + + // unreachable, but precaution never hurts, especially 1 day before release. + return fmt.Errorf("couldn't read file after %d retries", retryCount) +} + +// writeAndRename opens tmpPath exclusively, writes data to it and renames it to p. +func (t *FSTree) writeAndRename(tmpPath, p string, data []byte) error { + err := t.writeFile(tmpPath, data) + if err != nil { + var pe *fs.PathError + if errors.As(err, &pe) { + switch pe.Err { + case syscall.ENOSPC: + err = common.ErrNoSpace + _ = os.RemoveAll(tmpPath) + case syscall.EEXIST: + return syscall.EEXIST + } + } + } else { + err = os.Rename(tmpPath, p) + } + return err +} + +func (t *FSTree) writeFlags() int { + flags := os.O_WRONLY | os.O_CREATE | os.O_TRUNC | os.O_EXCL + if t.noSync { + return flags + } + return flags | os.O_SYNC +} + +// writeFile writes data to a file with path p. +// The code is copied from `os.WriteFile` with minor corrections for flags. +func (t *FSTree) writeFile(p string, data []byte) error { + f, err := os.OpenFile(p, t.writeFlags(), t.Permissions) + if err != nil { + return err + } + _, err = f.Write(data) + if err1 := f.Close(); err1 != nil && err == nil { + err = err1 + } + return err +} diff --git a/pkg/local_object_storage/blobstor/fstree/fstree_write_linux.go b/pkg/local_object_storage/blobstor/fstree/fstree_write_linux.go new file mode 100644 index 0000000000..0f8e598edd --- /dev/null +++ b/pkg/local_object_storage/blobstor/fstree/fstree_write_linux.go @@ -0,0 +1,48 @@ +//go:build linux + +package fstree + +import ( + "errors" + "strconv" + + "github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor/common" + "golang.org/x/sys/unix" +) + +func (t *FSTree) writeData(p string, data []byte) error { + err := t.writeFile(p, data) + if errors.Is(err, unix.ENOSPC) { + return common.ErrNoSpace + } + return err +} + +func (t *FSTree) writeFile(p string, data []byte) error { + flags := unix.O_WRONLY | unix.O_TMPFILE | unix.O_CLOEXEC + if !t.noSync { + flags |= unix.O_DSYNC + } + fd, err := unix.Open(t.RootPath, flags, uint32(t.Permissions)) + if err != nil { + return err + } + tmpPath := "/proc/self/fd/" + strconv.FormatUint(uint64(fd), 10) + n, err := unix.Write(fd, data) + if err == nil { + if n == len(data) { + err = unix.Linkat(unix.AT_FDCWD, tmpPath, unix.AT_FDCWD, p, unix.AT_SYMLINK_FOLLOW) + if errors.Is(err, unix.EEXIST) { + // https://github.com/nspcc-dev/neofs-node/issues/2563 + err = nil + } + } else { + err = errors.New("incomplete write") + } + } + errClose := unix.Close(fd) + if err != nil { + return err // Close() error is ignored, we have a better one. + } + return errClose +} diff --git a/pkg/local_object_storage/blobstor/peapod/peapod_test.go b/pkg/local_object_storage/blobstor/peapod/peapod_test.go index d31ac989af..52f0a4ea90 100644 --- a/pkg/local_object_storage/blobstor/peapod/peapod_test.go +++ b/pkg/local_object_storage/blobstor/peapod/peapod_test.go @@ -1,10 +1,7 @@ package peapod_test import ( - "crypto/rand" - "fmt" "path/filepath" - "sync" "testing" "time" @@ -245,60 +242,6 @@ func TestPeapod_Delete(t *testing.T) { }) } -func benchmark(b *testing.B, ppd *peapod.Peapod, objSize uint64, nThreads int) { - data := make([]byte, objSize) - rand.Read(data) - - prm := common.PutPrm{ - RawData: data, - } - - b.ReportAllocs() - b.ResetTimer() - - for i := 0; i < b.N; i++ { - var wg sync.WaitGroup - - for i := 0; i < nThreads; i++ { - wg.Add(1) - go func() { - defer wg.Done() - - prm := prm - prm.Address = oidtest.Address() - - _, err := ppd.Put(prm) - require.NoError(b, err) - }() - } - - wg.Wait() - } -} - -func BenchmarkPeapod_Put(b *testing.B) { - ppd := newTestPeapod(b) - - for _, tc := range []struct { - objSize uint64 - nThreads int - }{ - {1, 1}, - {1, 20}, - {1, 100}, - {1 << 10, 1}, - {1 << 10, 20}, - {1 << 10, 100}, - {100 << 10, 1}, - {100 << 10, 20}, - {100 << 10, 100}, - } { - b.Run(fmt.Sprintf("size=%d,thread=%d", tc.objSize, tc.nThreads), func(b *testing.B) { - benchmark(b, ppd, tc.objSize, tc.nThreads) - }) - } -} - func TestPeapod_IterateAddresses(t *testing.T) { ppd := newTestPeapod(t) diff --git a/pkg/local_object_storage/shard/control_test.go b/pkg/local_object_storage/shard/control_test.go index 326a184c73..b95daabd4b 100644 --- a/pkg/local_object_storage/shard/control_test.go +++ b/pkg/local_object_storage/shard/control_test.go @@ -117,6 +117,9 @@ func TestRefillMetabaseCorrupted(t *testing.T) { require.NoError(t, sh.Close()) addr := object.AddressOf(&obj) + // https://github.com/nspcc-dev/neofs-node/issues/2563 + _, err = fsTree.Delete(common.DeletePrm{Address: addr}) + require.NoError(t, err) _, err = fsTree.Put(common.PutPrm{Address: addr, RawData: []byte("not an object")}) require.NoError(t, err)