-
Notifications
You must be signed in to change notification settings - Fork 4.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
storage/raft: Fix memory allocation issue and Metadata tracking issues with snapshots #8793
Changes from 4 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -88,6 +88,10 @@ type FSM struct { | |
storeLatestState bool | ||
|
||
chunker *raftchunking.ChunkingBatchingFSM | ||
|
||
// testSnapshotRestoreError is used in tests to simulate an error while | ||
// restoring a snapshot. | ||
testSnapshotRestoreError bool | ||
} | ||
|
||
// NewFSM constructs a FSM using the given directory | ||
|
@@ -193,20 +197,20 @@ func (f *FSM) witnessIndex(i *IndexValue) { | |
} | ||
} | ||
|
||
func (f *FSM) witnessSnapshot(index, term, configurationIndex uint64, configuration raft.Configuration) error { | ||
func (f *FSM) witnessSnapshot(metadata *raft.SnapshotMeta) error { | ||
var indexBytes []byte | ||
latestIndex, _ := f.LatestState() | ||
|
||
latestIndex.Index = index | ||
latestIndex.Term = term | ||
latestIndex.Index = metadata.Index | ||
latestIndex.Term = metadata.Term | ||
|
||
var err error | ||
indexBytes, err = proto.Marshal(latestIndex) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
protoConfig := raftConfigurationToProtoConfiguration(configurationIndex, configuration) | ||
protoConfig := raftConfigurationToProtoConfiguration(metadata.ConfigurationIndex, metadata.Configuration) | ||
configBytes, err := proto.Marshal(protoConfig) | ||
if err != nil { | ||
return err | ||
|
@@ -232,16 +236,16 @@ func (f *FSM) witnessSnapshot(index, term, configurationIndex uint64, configurat | |
} | ||
} | ||
|
||
atomic.StoreUint64(f.latestIndex, index) | ||
atomic.StoreUint64(f.latestTerm, term) | ||
atomic.StoreUint64(f.latestIndex, metadata.Index) | ||
atomic.StoreUint64(f.latestTerm, metadata.Term) | ||
f.latestConfig.Store(protoConfig) | ||
|
||
return nil | ||
} | ||
|
||
// Delete deletes the given key from the bolt file. | ||
func (f *FSM) Delete(ctx context.Context, path string) error { | ||
defer metrics.MeasureSince([]string{"raft", "delete"}, time.Now()) | ||
defer metrics.MeasureSince([]string{"raft_storage", "fsm", "delete"}, time.Now()) | ||
|
||
f.l.RLock() | ||
defer f.l.RUnlock() | ||
|
@@ -253,7 +257,7 @@ func (f *FSM) Delete(ctx context.Context, path string) error { | |
|
||
// Delete deletes the given key from the bolt file. | ||
func (f *FSM) DeletePrefix(ctx context.Context, prefix string) error { | ||
defer metrics.MeasureSince([]string{"raft", "delete_prefix"}, time.Now()) | ||
defer metrics.MeasureSince([]string{"raft_storage", "fsm", "delete_prefix"}, time.Now()) | ||
|
||
f.l.RLock() | ||
defer f.l.RUnlock() | ||
|
@@ -277,7 +281,7 @@ func (f *FSM) DeletePrefix(ctx context.Context, prefix string) error { | |
|
||
// Get retrieves the value at the given path from the bolt file. | ||
func (f *FSM) Get(ctx context.Context, path string) (*physical.Entry, error) { | ||
defer metrics.MeasureSince([]string{"raft", "get"}, time.Now()) | ||
defer metrics.MeasureSince([]string{"raft_storage", "fsm", "get"}, time.Now()) | ||
|
||
f.l.RLock() | ||
defer f.l.RUnlock() | ||
|
@@ -311,7 +315,7 @@ func (f *FSM) Get(ctx context.Context, path string) (*physical.Entry, error) { | |
|
||
// Put writes the given entry to the bolt file. | ||
func (f *FSM) Put(ctx context.Context, entry *physical.Entry) error { | ||
defer metrics.MeasureSince([]string{"raft", "put"}, time.Now()) | ||
defer metrics.MeasureSince([]string{"raft_storage", "fsm", "put"}, time.Now()) | ||
|
||
f.l.RLock() | ||
defer f.l.RUnlock() | ||
|
@@ -324,7 +328,7 @@ func (f *FSM) Put(ctx context.Context, entry *physical.Entry) error { | |
|
||
// List retrieves the set of keys with the given prefix from the bolt file. | ||
func (f *FSM) List(ctx context.Context, prefix string) ([]string, error) { | ||
defer metrics.MeasureSince([]string{"raft", "list"}, time.Now()) | ||
defer metrics.MeasureSince([]string{"raft_storage", "fsm", "list"}, time.Now()) | ||
|
||
f.l.RLock() | ||
defer f.l.RUnlock() | ||
|
@@ -531,6 +535,8 @@ type writeErrorCloser interface { | |
// (size, checksum, etc) and a second for the sink of the data. We also use a | ||
// proto delimited writer so we can stream proto messages to the sink. | ||
func (f *FSM) writeTo(ctx context.Context, metaSink writeErrorCloser, sink writeErrorCloser) { | ||
defer metrics.MeasureSince([]string{"raft_storage", "fsm", "write_snapshot"}, time.Now()) | ||
|
||
protoWriter := protoio.NewDelimitedWriter(sink) | ||
metadataProtoWriter := protoio.NewDelimitedWriter(metaSink) | ||
|
||
|
@@ -573,7 +579,9 @@ func (f *FSM) writeTo(ctx context.Context, metaSink writeErrorCloser, sink write | |
|
||
// Snapshot implements the FSM interface. It returns a noop snapshot object. | ||
func (f *FSM) Snapshot() (raft.FSMSnapshot, error) { | ||
return &noopSnapshotter{}, nil | ||
return &noopSnapshotter{ | ||
fsm: f, | ||
}, nil | ||
} | ||
|
||
// SetNoopRestore is used to disable restore operations on raft startup. Because | ||
|
@@ -589,48 +597,91 @@ func (f *FSM) SetNoopRestore(enabled bool) { | |
// first deletes the existing bucket to clear all existing data, then recreates | ||
// it so we can copy in the snapshot. | ||
func (f *FSM) Restore(r io.ReadCloser) error { | ||
defer metrics.MeasureSince([]string{"raft_storage", "fsm", "restore_snapshot"}, time.Now()) | ||
|
||
if f.noopRestore == true { | ||
return nil | ||
} | ||
|
||
snapMeta := r.(*boltSnapshotMetadataReader).Metadata() | ||
|
||
protoReader := protoio.NewDelimitedReader(r, math.MaxInt32) | ||
defer protoReader.Close() | ||
|
||
f.l.Lock() | ||
defer f.l.Unlock() | ||
|
||
// Start a write transaction. | ||
// Delete the existing data bucket and create a new one. | ||
f.logger.Debug("snapshot restore: deleting bucket") | ||
err := f.db.Update(func(tx *bolt.Tx) error { | ||
err := tx.DeleteBucket(dataBucketName) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
b, err := tx.CreateBucket(dataBucketName) | ||
_, err = tx.CreateBucket(dataBucketName) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
for { | ||
return nil | ||
}) | ||
if err != nil { | ||
f.logger.Error("could not restore snapshot: could not clear existing bucket", "error", err) | ||
return err | ||
} | ||
|
||
// If we are testing a failed snapshot error here. | ||
if f.testSnapshotRestoreError { | ||
return errors.New("Test error") | ||
} | ||
|
||
f.logger.Debug("snapshot restore: deleting bucket done") | ||
f.logger.Debug("snapshot restore: writing keys") | ||
|
||
var done bool | ||
var keys int | ||
for !done { | ||
err := f.db.Update(func(tx *bolt.Tx) error { | ||
b := tx.Bucket(dataBucketName) | ||
s := new(pb.StorageEntry) | ||
err := protoReader.ReadMsg(s) | ||
if err != nil { | ||
if err == io.EOF { | ||
return nil | ||
|
||
// Commit in batches of 50k. Bolt holds all the data in memory and | ||
// doesn't split the pages until commit so we do incremental writes. | ||
// This is safe since we have a write lock on the fsm's lock. | ||
for i := 0; i < 50000; i++ { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Mostly curious here, but how was 50k chosen as the batch value to use? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The boltdb docs say we shouldn't go over 100,000 items in a batch update. So I just went a good bit below that |
||
err := protoReader.ReadMsg(s) | ||
if err != nil { | ||
if err == io.EOF { | ||
done = true | ||
return nil | ||
} | ||
return err | ||
} | ||
return err | ||
} | ||
|
||
err = b.Put([]byte(s.Key), s.Value) | ||
if err != nil { | ||
return err | ||
err = b.Put([]byte(s.Key), s.Value) | ||
if err != nil { | ||
return err | ||
} | ||
keys += 1 | ||
} | ||
|
||
return nil | ||
}) | ||
if err != nil { | ||
f.logger.Error("could not restore snapshot", "error", err) | ||
return err | ||
} | ||
|
||
return nil | ||
}) | ||
if err != nil { | ||
f.logger.Error("could not restore snapshot", "error", err) | ||
f.logger.Trace("snapshot restore: writing keys", "num_written", keys) | ||
} | ||
|
||
f.logger.Debug("snapshot restore: writing keys done") | ||
|
||
// Write the metadata after we have applied all the snapshot data | ||
f.logger.Debug("snapshot restore: writing metadata") | ||
if err := f.witnessSnapshot(snapMeta); err != nil { | ||
f.logger.Error("could not write metadata", "error", err) | ||
return err | ||
} | ||
|
||
|
@@ -639,10 +690,23 @@ func (f *FSM) Restore(r io.ReadCloser) error { | |
|
||
// noopSnapshotter implements the fsm.Snapshot interface. It doesn't do anything | ||
// since our SnapshotStore reads data out of the FSM on Open(). | ||
type noopSnapshotter struct{} | ||
type noopSnapshotter struct { | ||
fsm *FSM | ||
} | ||
|
||
// Persist doesn't do anything. | ||
// Persist implements the fsm.Snapshot interface. It doesn't need to persist any | ||
// state data, but it does persist the raft metadata. This is necessary so we | ||
// can be sure to capture indexes for operation types that are not sent to the | ||
// FSM. | ||
func (s *noopSnapshotter) Persist(sink raft.SnapshotSink) error { | ||
boltSnapshotSink := sink.(*BoltSnapshotSink) | ||
|
||
// We are processing a snapshot, fastforward the index, term, and | ||
// configuration to the latest seen by the raft system. | ||
if err := s.fsm.witnessSnapshot(&boltSnapshotSink.meta); err != nil { | ||
return err | ||
} | ||
|
||
return nil | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
these renames seem like a breaking change?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, you're correct; however, the FSM's Delete, DeletePrefix, and Put functions aren't called during normal Vault operation, so these metrics shouldn't have been emitted in the first place. Get and List will be a change, but there is already another function that emits these same metrics with more accuracy. Really, the old name here is a bug, since it scopes the metric to an incorrect system.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I went ahead and put the metrics for Get and List back so as not to break any existing dashboards. We are now emitting both the old and new names.