KAFKA-15046: Get rid of unnecessary fsyncs inside UnifiedLog.lock to stabilize performance #14242
CheckpointFile.java

```diff
@@ -72,18 +72,20 @@ public CheckpointFile(File file,
         tempPath = Paths.get(absolutePath + ".tmp");
     }

-    public void write(Collection<T> entries) throws IOException {
+    public void write(Collection<T> entries, boolean sync) throws IOException {
         synchronized (lock) {
             // write to temp file and then swap with the existing file
             try (FileOutputStream fileOutputStream = new FileOutputStream(tempPath.toFile());
                  BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fileOutputStream, StandardCharsets.UTF_8))) {
                 CheckpointWriteBuffer<T> checkpointWriteBuffer = new CheckpointWriteBuffer<>(writer, version, formatter);
                 checkpointWriteBuffer.write(entries);
                 writer.flush();
-                fileOutputStream.getFD().sync();
+                if (sync) {
+                    fileOutputStream.getFD().sync();
+                }
             }

-            Utils.atomicMoveWithFallback(tempPath, absolutePath);
+            Utils.atomicMoveWithFallback(tempPath, absolutePath, sync);
         }
     }
```

Review thread on the new `if (sync)` guard:

@ocadaruma : I realized a potential issue with this change. If `sync` is false, we don't force a flush to disk; however, the OS could flush partial content of the leader epoch file. If the broker has a hard failure, the leader epoch file could be corrupted. In the recovery path, since we always expect the leader epoch file to be well-formed, a corrupted leader epoch file will fail the recovery.

@junrao Hmm, that's true. Thanks for pointing that out.

Thanks, @ocadaruma!
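To make the mechanics concrete, here is a standalone sketch of the write-temp-then-atomic-rename pattern that `CheckpointFile.write` follows, with the fsync made conditional as in this diff. The class and method names are illustrative, and plain `String` entries stand in for Kafka's generic entries, `CheckpointWriteBuffer`, and `Utils.atomicMoveWithFallback`:

```java
import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.Collection;

// Illustrative sketch, not Kafka's API: write all entries to a temp file,
// optionally fsync, then atomically rename over the target so readers never
// observe a half-written checkpoint file.
final class CheckpointWriteSketch {
    static void write(Path target, Collection<String> entries, boolean sync) throws IOException {
        Path temp = target.resolveSibling(target.getFileName() + ".tmp");
        try (FileOutputStream out = new FileOutputStream(temp.toFile());
             BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8))) {
            for (String entry : entries) {
                writer.write(entry);
                writer.newLine();
            }
            writer.flush();          // push buffered bytes into the OS page cache
            if (sync) {
                out.getFD().sync();  // force them to the device -- the fsync this PR makes optional
            }
        }
        // The rename is atomic, so a reader sees either the old or the new file, never a mix.
        Files.move(temp, target, StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING);
    }
}
```

Note how this sketch exhibits the risk raised in the thread above: with `sync == false` the rename can complete before the data reaches the device, so after a power loss the already-renamed file may hold only partial content.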
LeaderEpochFileCache.java
```diff
@@ -72,7 +72,7 @@ public void assign(int epoch, long startOffset) {
         EpochEntry entry = new EpochEntry(epoch, startOffset);
         if (assign(entry)) {
             log.debug("Appended new epoch entry {}. Cache now contains {} entries.", entry, epochs.size());
-            flush();
+            writeToFile(true);
         }
     }
```
```diff
@@ -82,7 +82,7 @@ public void assign(List<EpochEntry> entries) {
                 log.debug("Appended new epoch entry {}. Cache now contains {} entries.", entry, epochs.size());
             }
         });
-        if (!entries.isEmpty()) flush();
+        if (!entries.isEmpty()) writeToFile(true);
     }

     private boolean isUpdateNeeded(EpochEntry entry) {
```
```diff
@@ -151,11 +151,6 @@ private List<EpochEntry> removeWhileMatching(Iterator<Map.Entry<Integer, EpochEntry>> iterator,
         return removedEpochs;
     }

-    public LeaderEpochFileCache cloneWithLeaderEpochCheckpoint(LeaderEpochCheckpoint leaderEpochCheckpoint) {
-        flushTo(leaderEpochCheckpoint);
-        return new LeaderEpochFileCache(this.topicPartition, leaderEpochCheckpoint);
-    }
-
     public boolean nonEmpty() {
         lock.readLock().lock();
         try {
```
```diff
@@ -308,7 +303,14 @@ public void truncateFromEnd(long endOffset) {
         if (endOffset >= 0 && epochEntry.isPresent() && epochEntry.get().startOffset >= endOffset) {
             List<EpochEntry> removedEntries = removeFromEnd(x -> x.startOffset >= endOffset);

-            flush();
+            // We intentionally don't force flushing the change to the device here, because:
+            // - We want to avoid fsync latency:
+            //   * fsync latency can be huge when a disk glitches, which is not rare on spinning drives
+            //   * this method is called by ReplicaFetcher threads, so high fsync latency could block
+            //     replica fetching, causing ISR shrinks or high produce-response-time degradation in remote scope
+            // - Even if stale epochs remain in the LeaderEpoch file after an unclean shutdown, they are handled
+            //   by another truncateFromEnd call in the log loading procedure, so it won't be a problem
+            writeToFile(false);

             log.debug("Cleared entries {} from epoch cache after truncating to end offset {}, leaving {} entries in the cache.", removedEntries, endOffset, epochs.size());
         }
```
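The safety half of that comment can be illustrated with a toy model: even if the un-fsynced truncation is lost in a crash and a stale entry survives in the on-disk file, replaying `truncateFromEnd` while loading the log removes it again. This is a simplified stand-in, not Kafka's actual cache:

```java
import java.util.TreeMap;

// Toy model (not Kafka code) of the safety argument: a stale epoch entry that
// survived a crash because fsync was skipped is removed again when log loading
// replays the truncation.
final class EpochTruncationModel {
    private final TreeMap<Integer, Long> epochs = new TreeMap<>(); // epoch -> startOffset

    void assign(int epoch, long startOffset) {
        epochs.put(epoch, startOffset);
    }

    // Mirrors the real predicate: drop entries whose startOffset >= endOffset.
    void truncateFromEnd(long endOffset) {
        epochs.entrySet().removeIf(e -> e.getValue() >= endOffset);
    }

    public static void main(String[] args) {
        EpochTruncationModel recovered = new EpochTruncationModel();
        recovered.assign(1, 0L);
        recovered.assign(2, 100L);       // stale entry: its removal never reached the device
        recovered.truncateFromEnd(100L); // log loading replays the truncation
        System.out.println(recovered.epochs); // {1=0} -- the stale epoch is gone again
    }
}
```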
```diff
@@ -335,7 +337,14 @@ public void truncateFromStart(long startOffset) {
             EpochEntry updatedFirstEntry = new EpochEntry(firstBeforeStartOffset.epoch, startOffset);
             epochs.put(updatedFirstEntry.epoch, updatedFirstEntry);

-            flush();
+            // We intentionally don't force flushing the change to the device here, because:
+            // - We want to avoid fsync latency:
+            //   * fsync latency can be huge when a disk glitches, which is not rare on spinning drives
+            //   * this method is called as part of deleteRecords while holding UnifiedLog#lock; meanwhile
+            //     all produces against the partition are blocked, which could exhaust request handlers
+            // - Even if stale epochs remain in the LeaderEpoch file after an unclean shutdown, they are recovered
+            //   by another truncateFromStart call in the log loading procedure, so it won't be a problem
+            writeToFile(false);

             log.debug("Cleared entries {} and rewrote first entry {} after truncating to start offset {}, leaving {} in the cache.", removedEntries, updatedFirstEntry, startOffset, epochs.size());
         }
```
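The `UnifiedLog#lock` point is about lock convoying: an fsync issued while the log lock is held makes every produce to that partition wait out the full fsync, tying up request-handler threads. A minimal stand-alone illustration (the lock and the sleep are stand-ins, not Kafka code):

```java
import java.util.concurrent.locks.ReentrantLock;

// Toy illustration (not Kafka code): a slow fsync under a hot lock stalls every
// other thread that needs the same lock for the full fsync duration.
final class LockHeldFsyncDemo {
    private static final ReentrantLock logLock = new ReentrantLock(); // stand-in for UnifiedLog#lock

    static void checkpointWithFsync() throws InterruptedException {
        logLock.lock();
        try {
            Thread.sleep(500); // stand-in for an fsync stalling on a glitching disk
        } finally {
            logLock.unlock();
        }
    }

    static void produce() {
        logLock.lock(); // a request-handler thread blocks here until the fsync finishes
        try {
            // append the batch to the log
        } finally {
            logLock.unlock();
        }
    }

    public static void main(String[] args) throws InterruptedException {
        Thread slow = new Thread(() -> {
            try { checkpointWithFsync(); } catch (InterruptedException ignored) { }
        });
        slow.start();
        Thread.sleep(50); // let the checkpoint grab the lock first
        long start = System.nanoTime();
        produce();        // stalls behind the simulated fsync
        System.out.println("produce waited " + (System.nanoTime() - start) / 1_000_000 + " ms");
        slow.join();
    }
}
```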
```diff
@@ -384,7 +393,7 @@ public void clearAndFlush() {
         lock.writeLock().lock();
         try {
             epochs.clear();
-            flush();
+            writeToFile(true);
         } finally {
             lock.writeLock().unlock();
         }
```
```diff
@@ -421,16 +430,12 @@ public NavigableMap<Integer, Long> epochWithOffsets() {
         }
     }

-    private void flushTo(LeaderEpochCheckpoint leaderEpochCheckpoint) {
+    private void writeToFile(boolean sync) {
         lock.readLock().lock();
         try {
-            leaderEpochCheckpoint.write(epochs.values());
+            this.checkpoint.write(epochs.values(), sync);
         } finally {
             lock.readLock().unlock();
         }
     }
-
-    private void flush() {
-        flushTo(this.checkpoint);
-    }
 }
```

Review comment on the new `this.checkpoint.write(epochs.values(), sync)` call: Do we need …
Review comment on the PR:

Is it possible to add a test to verify that the recovery point is only advanced after the producer state has been flushed to disk?