From 64aabc918306aeb53ca4c6e35ddee090317903c5 Mon Sep 17 00:00:00 2001 From: Abhishek Madan Date: Mon, 10 Dec 2018 12:33:03 -0800 Subject: [PATCH] Properly set smallest key of subcompaction output (#4723) Summary: It is possible to see a situation like the following when subcompactions are enabled: 1. A subcompaction boundary is set to `[b, e)`. 2. The first output file in a subcompaction has `c@20` as its smallest key. 3. The range tombstone `[a, d)@30` is encountered. 4. The tombstone is written to the range-del meta block and the new smallest key is set to `b@0` (since no keys in this subcompaction's output can be smaller than `b`). 5. A key `b@10` in a lower level will now reappear, since it is not covered by the truncated start key `b@0`. In general, unless the smallest data key in a file has a seqnum of 0, it is not safe to truncate a tombstone at the start key to have a seqnum of 0, since it can expose keys with a seqnum greater than 0 but less than the tombstone's actual seqnum. To fix this, when the lower bound of a file is from the subcompaction boundaries, we now set the seqnum of an artificially extended smallest key to the tombstone's seqnum. This is safe because subcompactions operate over disjoint sets of keys, and the subcompactions that can experience this problem are not the first subcompaction (which is unbounded on the left). Furthermore, there is now an assertion to detect the described anomalous case. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4723 Differential Revision: D13236188 Pulled By: abhimadan fbshipit-source-id: a6da6a113f2de1e2ff307ca72e055300c8fe5692 --- db/compaction_job.cc | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/db/compaction_job.cc b/db/compaction_job.cc index 8a878fe725f..dd04d8b205d 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -1192,10 +1192,12 @@ Status CompactionJob::FinishCompactionOutputFile( Slice lower_bound_guard, upper_bound_guard; std::string smallest_user_key; const Slice *lower_bound, *upper_bound; + bool lower_bound_from_sub_compact = false; if (sub_compact->outputs.size() == 1) { // For the first output table, include range tombstones before the min key // but after the subcompaction boundary. lower_bound = sub_compact->start; + lower_bound_from_sub_compact = true; } else if (meta->smallest.size() > 0) { // For subsequent output tables, only include range tombstones from min // key onwards since the previous file was extended to contain range @@ -1265,11 +1267,24 @@ Status CompactionJob::FinishCompactionOutputFile( // (the max key in the previous table or subcompaction) in order for // files to appear key-space partitioned. // - // Choose lowest seqnum so this file's smallest internal key comes - // after the previous file's/subcompaction's largest. The fake seqnum - // is OK because the read path's file-picking code only considers user - // key. - smallest_candidate = InternalKey(*lower_bound, 0, kTypeRangeDeletion); + // When lower_bound is chosen by a subcompaction, we know that + // subcompactions over smaller keys cannot contain any keys at + // lower_bound. We also know that smaller subcompactions exist, because + // otherwise the subcompaction would be unbounded on the left. 
As a + // result, we know that no other files on the output level will contain + // actual keys at lower_bound (an output file may have a largest key of + // lower_bound@kMaxSequenceNumber, but this only indicates a large range + // tombstone was truncated). Therefore, it is safe to use the + // tombstone's sequence number, to ensure that keys at lower_bound at + // lower levels are covered by truncated tombstones. + // + // If lower_bound was chosen by the smallest data key in the file, + // choose lowest seqnum so this file's smallest internal key comes after + // the previous file's largest. The fake seqnum is OK because the read + // path's file-picking code only considers user key. + smallest_candidate = InternalKey( + *lower_bound, lower_bound_from_sub_compact ? tombstone.seq_ : 0, + kTypeRangeDeletion); } InternalKey largest_candidate = tombstone.SerializeEndKey(); if (upper_bound != nullptr && @@ -1291,9 +1306,23 @@ Status CompactionJob::FinishCompactionOutputFile( largest_candidate = InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion); } +#ifndef NDEBUG + SequenceNumber smallest_ikey_seqnum = kMaxSequenceNumber; + if (meta->smallest.size() > 0) { + smallest_ikey_seqnum = GetInternalKeySeqno(meta->smallest.Encode()); + } +#endif meta->UpdateBoundariesForRange(smallest_candidate, largest_candidate, tombstone.seq_, cfd->internal_comparator()); + + // The smallest key in a file is used for range tombstone truncation, so + // it cannot have a seqnum of 0 (unless the smallest data key in a file + // has a seqnum of 0). Otherwise, the truncated tombstone may expose + // deleted keys at lower levels. + assert(smallest_ikey_seqnum == 0 || + ExtractInternalKeyFooter(meta->smallest.Encode()) != + PackSequenceAndType(0, kTypeRangeDeletion)); } meta->marked_for_compaction = sub_compact->builder->NeedCompact(); }