Truncate range tombstones by leveraging InternalKeys #4432

Closed
Changes from 5 commits
82 changes: 81 additions & 1 deletion db/compaction.cc
@@ -23,6 +23,43 @@

namespace rocksdb {

const uint64_t kRangeTombstoneSentinel =
    PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion);

int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
                      const InternalKey& b) {
  auto c = user_cmp->Compare(a.user_key(), b.user_key());
  if (c != 0) {
    return c;
  }
  auto a_footer = ExtractInternalKeyFooter(a.Encode());
  auto b_footer = ExtractInternalKeyFooter(b.Encode());
  if (a_footer == kRangeTombstoneSentinel) {
    if (b_footer != kRangeTombstoneSentinel) {
      return -1;
    }
  } else if (b_footer == kRangeTombstoneSentinel) {
    return 1;
  }
  return 0;
}

int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a,
                      const InternalKey& b) {
  if (a == nullptr) {
    return -1;
  }
  return sstableKeyCompare(user_cmp, *a, b);
}

int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
                      const InternalKey* b) {
  if (b == nullptr) {
    return -1;
  }
  return sstableKeyCompare(user_cmp, a, *b);
}

uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
  uint64_t sum = 0;
  for (size_t i = 0; i < files.size() && files[i]; i++) {
@@ -81,6 +118,49 @@ void Compaction::GetBoundaryKeys(
}
}

std::vector<CompactionInputFiles> Compaction::PopulateWithAtomicBoundaries(
    VersionStorageInfo* vstorage, std::vector<CompactionInputFiles> inputs) {
  const Comparator* ucmp = vstorage->InternalComparator()->user_comparator();
  for (size_t i = 0; i < inputs.size(); i++) {
    if (inputs[i].level == 0 || inputs[i].files.empty()) {
      continue;
    }
    inputs[i].atomic_compaction_unit_boundaries.reserve(inputs[i].files.size());
    AtomicCompactionUnitBoundary cur_boundary;
    size_t first_atomic_idx = 0;
    auto add_unit_boundary = [&](size_t to) {
      if (first_atomic_idx == to) return;
      for (size_t k = first_atomic_idx; k < to; k++) {
        inputs[i].atomic_compaction_unit_boundaries.push_back(cur_boundary);
      }
      first_atomic_idx = to;
    };
    for (size_t j = 0; j < inputs[i].files.size(); j++) {
      const auto* f = inputs[i].files[j];
      if (j == 0) {
        // First file in a level.
        cur_boundary.smallest = &f->smallest;
        cur_boundary.largest = &f->largest;
      } else if (sstableKeyCompare(ucmp, *cur_boundary.largest, f->smallest) ==
                 0) {
        // SSTs overlap but the end key of the previous file was not
        // artificially extended by a range tombstone. Extend the current
        // boundary.
        cur_boundary.largest = &f->largest;
      } else {
        // Atomic compaction unit has ended.
        add_unit_boundary(j);
        cur_boundary.smallest = &f->smallest;
        cur_boundary.largest = &f->largest;
      }
    }
    add_unit_boundary(inputs[i].files.size());
    assert(inputs[i].files.size() ==
           inputs[i].atomic_compaction_unit_boundaries.size());
  }
  return inputs;
}

// helper function to determine if compaction is creating files at the
// bottommost level
bool Compaction::IsBottommostLevel(
@@ -155,7 +235,7 @@ Compaction::Compaction(VersionStorageInfo* vstorage,
      output_compression_(_compression),
      output_compression_opts_(_compression_opts),
      deletion_compaction_(_deletion_compaction),
-     inputs_(std::move(_inputs)),
+     inputs_(PopulateWithAtomicBoundaries(vstorage, std::move(_inputs))),
      grandparents_(std::move(_grandparents)),
      score_(_score),
      bottommost_level_(IsBottommostLevel(output_level_, vstorage, inputs_)),
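
For readers following the new PopulateWithAtomicBoundaries loop, here is a small self-contained model (not part of the diff) of how files are grouped into atomic compaction units. The Key, File, and GroupIntoAtomicUnits names are illustrative stand-ins for InternalKey/FileMetaData and the loop above, under the simplifying assumption that the only property that matters for grouping is whether a shared boundary user key is a real point key or the range tombstone sentinel:

// Illustrative sketch only: models the grouping above with plain structs
// instead of RocksDB's InternalKey/FileMetaData.
#include <cassert>
#include <string>
#include <vector>

struct Key { std::string user_key; bool is_sentinel; };
struct File { Key smallest; Key largest; };

// Mirrors sstableKeyCompare: compare user keys first; with equal user keys,
// the range tombstone sentinel sorts before any real key.
int Compare(const Key& a, const Key& b) {
  if (a.user_key != b.user_key) return a.user_key < b.user_key ? -1 : 1;
  if (a.is_sentinel != b.is_sentinel) return a.is_sentinel ? -1 : 1;
  return 0;
}

// Returns, for each file, the index of the first file in its atomic
// compaction unit (files are assumed sorted and not in L0).
std::vector<size_t> GroupIntoAtomicUnits(const std::vector<File>& files) {
  std::vector<size_t> unit_start(files.size(), 0);
  for (size_t j = 1; j < files.size(); j++) {
    // Extend the current unit only when the previous file's largest key
    // compares equal to this file's smallest key, i.e. the shared boundary
    // is a real user key rather than an artificially extended sentinel.
    unit_start[j] = (Compare(files[j - 1].largest, files[j].smallest) == 0)
                        ? unit_start[j - 1]
                        : j;
  }
  return unit_start;
}

int main() {
  std::vector<File> files = {
      {{"a", false}, {"c", false}},  // file 0
      {{"c", false}, {"e", true}},   // file 1: end extended by a tombstone
      {{"e", false}, {"g", false}},  // file 2
  };
  auto units = GroupIntoAtomicUnits(files);
  assert(units[1] == 0);  // files 0 and 1 share the real boundary key "c"
  assert(units[2] == 2);  // file 1 ends in a sentinel, so a new unit starts
  return 0;
}
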
45 changes: 45 additions & 0 deletions db/compaction.h
@@ -15,11 +15,43 @@

namespace rocksdb {

// Utility for comparing sstable boundary keys. Returns -1 if either a or b is
// null which provides the property that a==null indicates a key that is less
// than any key and b==null indicates a key that is greater than any key. Note
// that the comparison is performed primarily on the user-key portion of the
// key. If the user-keys compare equal, an additional test is made to sort
// range tombstone sentinel keys before other keys with the same user-key. The
// result is that 2 user-keys will compare equal if they differ purely on
// their sequence number and value, but the range tombstone sentinel for that
// user-key will compare not equal. This is necessary because the range
// tombstone sentinel key is set as the largest key for an sstable even though
// that key never appears in the database. We don't want adjacent sstables to
// be considered overlapping if they are separated by the range tombstone
// sentinel.
int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
                      const InternalKey& b);
int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a,
                      const InternalKey& b);
int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
                      const InternalKey* b);

// An AtomicCompactionUnitBoundary represents a range of keys [smallest,
// largest] that exactly spans one or more neighbouring SSTs on the same
// level. Every pair of neighbouring SSTs in this range "overlap" (i.e., the
// largest user key of one file is the smallest user key of the next file).
// These boundaries are propagated down to RangeDelAggregator during
// compaction to provide safe truncation boundaries for range tombstones.
struct AtomicCompactionUnitBoundary {
  const InternalKey* smallest = nullptr;
  const InternalKey* largest = nullptr;
};

// The structure that manages compaction input files associated
// with the same physical level.
struct CompactionInputFiles {
  int level;
  std::vector<FileMetaData*> files;
  std::vector<AtomicCompactionUnitBoundary> atomic_compaction_unit_boundaries;
  inline bool empty() const { return files.empty(); }
  inline size_t size() const { return files.size(); }
  inline void clear() { files.clear(); }
@@ -96,6 +128,12 @@ class Compaction {
    return inputs_[compaction_input_level][i];
  }

  const std::vector<AtomicCompactionUnitBoundary>* boundaries(
      size_t compaction_input_level) const {
    assert(compaction_input_level < inputs_.size());
    return &inputs_[compaction_input_level].atomic_compaction_unit_boundaries;
  }

  // Returns the list of file meta data of the specified compaction
  // input level.
  // REQUIREMENT: "compaction_input_level" must be >= 0 and
@@ -262,6 +300,13 @@
      const std::vector<CompactionInputFiles>& inputs,
      Slice* smallest_key, Slice* largest_key);

  // Get the atomic file boundaries for all files in the compaction. Necessary
  // in order to avoid the scenario described in
  // https://github.com/facebook/rocksdb/pull/4432#discussion_r221072219 and
  // plumb down appropriate key boundaries to RangeDelAggregator during
  // compaction.
  static std::vector<CompactionInputFiles> PopulateWithAtomicBoundaries(
      VersionStorageInfo* vstorage, std::vector<CompactionInputFiles> inputs);

  // helper function to determine if compaction with inputs and storage is
  // bottommost
  static bool IsBottommostLevel(
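
As a concrete illustration of the ordering described in the comment above, the following standalone sketch (not part of the diff) exercises sstableKeyCompare with a range tombstone sentinel key. It assumes it is compiled inside the RocksDB source tree, so that db/compaction.h and db/dbformat.h are on the include path and the library is linked:

#include <cassert>

#include "db/compaction.h"       // sstableKeyCompare (added by this PR)
#include "db/dbformat.h"         // InternalKey, kMaxSequenceNumber, value types
#include "rocksdb/comparator.h"  // BytewiseComparator

int main() {
  const rocksdb::Comparator* ucmp = rocksdb::BytewiseComparator();
  // Largest key of a file whose end was artificially extended by a range
  // tombstone: user key "b" with the sentinel footer.
  rocksdb::InternalKey sentinel_end("b", rocksdb::kMaxSequenceNumber,
                                    rocksdb::kTypeRangeDeletion);
  // Smallest key of the next file: a real point key at the same user key.
  rocksdb::InternalKey point_start("b", /*sequence=*/100,
                                   rocksdb::kTypeValue);
  // The sentinel sorts strictly before the point key, so the two files are
  // not treated as overlapping and need not form one atomic unit.
  assert(rocksdb::sstableKeyCompare(ucmp, sentinel_end, point_start) < 0);
  // Two real keys that differ only in sequence number compare equal, so
  // files sharing a real boundary user key are grouped together.
  rocksdb::InternalKey point_end("b", /*sequence=*/200, rocksdb::kTypeValue);
  assert(rocksdb::sstableKeyCompare(ucmp, point_end, point_start) == 0);
  // A null pointer stands in for "less/greater than any key" per the
  // pointer overloads above.
  assert(rocksdb::sstableKeyCompare(ucmp, nullptr, point_start) < 0);
  return 0;
}
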
9 changes: 5 additions & 4 deletions db/compaction_job.cc
@@ -1212,10 +1212,11 @@ Status CompactionJob::FinishCompactionOutputFile(
    for (; it->Valid(); it->Next()) {
      auto tombstone = it->Tombstone();
      if (upper_bound != nullptr &&
-         ucmp->Compare(*upper_bound, tombstone.start_key_) <= 0) {
-       // Tombstones starting at upper_bound or later only need to be included
-       // in the next table. Break because subsequent tombstones will start
-       // even later.
+         ucmp->Compare(*upper_bound, tombstone.start_key_) < 0) {
Contributor

@abhimadan @ajkr This change broke one of the tests in CockroachDB. The test sets up a scenario where there are 4 sstables:

0: "a000000000" - "c000010000"
6: "a000000000" - "a000009999"
6: "b000000000" - "b000009999"
6: "c000000000" - "c000009999"

The L0 sstable has a single key at a000000000 and a range deletion from [c000000000,c000010000). The test then performs a manual compaction from [c000000000,c000010000) (i.e. the same keys as the range deletion). The first step in the manual compaction compacts the L0 sstable to L5 which results in two sstables at L5:

#12: a000000000#1,1 c000000000#72057594037927935,15
#13: c000000000#3,0 c000010000#72057594037927935,15

This is where the problem occurs for the test. Prior to this line of code changing, the compaction generated L5 sstables that looked like:

#12: a000000000#1,1 a000000000#1,1
#13: c000000000#3,0 c000010000#72057594037927935,15

It looks like the range tombstone is being included in sstable #12 unnecessarily. I see that this change was made in order to fix the problem tested by KeyAtOverlappingEndpointReappears, though it looks to be pessimistic. In my test, upper_bound is being set to the min-key in the following table.

Contributor

@petermattis - thanks for pointing this out. Just to confirm, the problem here is performance (we store an extra range tombstone and unnecessarily extend an SST file), not correctness, right?

Understood what you meant about comparing against next file's start key being pessimistic. Let's see if there's a way to compare against the current file's end key instead.

Contributor

On second thought, then we'd need additional logic to handle tombstones fully in the gap between files. Will think a bit more.

Contributor

Yes, the problem is performance, not correctness. In my test, when #12 is compacted from L5 to L6, it will compact against 2 sstables in L6 instead of 1. I think we can generate scenarios which produce arbitrarily large compactions due to this.

Contributor

OK, we can probably revert this change, and fix KeyAtOverlappingEndpointReappears by having an additional condition that tombstones we skip should start strictly after the file's last point key.

Contributor

proposed fix: #4592. BTW, if your test exercises RocksDB APIs directly, I'd be interested in copying it :P.

+       // Tombstones starting after upper_bound only need to be included in the
+       // next table (if the SSTs overlap, then upper_bound is contained in
+       // this SST and hence must be covered). Break because subsequent
+       // tombstones will start even later.
        break;
      }

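To make the effect of changing <= to < concrete, here is a tiny self-contained model (not part of the diff). StopOld and StopNew are illustrative stand-ins for the old and new break conditions above, and upper_bound plays the role of the following table's smallest user key, as in the CockroachDB scenario discussed in the thread:

#include <cassert>
#include <string>

// Old condition: stop emitting tombstones into the current output file as
// soon as one starts at or after upper_bound.
bool StopOld(const std::string& upper_bound, const std::string& start_key) {
  return upper_bound.compare(start_key) <= 0;
}

// New condition (this change): stop only when a tombstone starts strictly
// after upper_bound, so one starting exactly at upper_bound is still written
// into the current file.
bool StopNew(const std::string& upper_bound, const std::string& start_key) {
  return upper_bound.compare(start_key) < 0;
}

int main() {
  // A range tombstone starting exactly at the next table's smallest key:
  // previously it was deferred entirely to the next table; with this change
  // it is also kept in the current file, which is what the scenario above
  // observed as an unnecessarily extended sstable #12.
  assert(StopOld("c000000000", "c000000000") == true);
  assert(StopNew("c000000000", "c000000000") == false);
  // A tombstone starting strictly after upper_bound is skipped either way.
  assert(StopOld("c000000000", "c000000001") == true);
  assert(StopNew("c000000000", "c000000001") == true);
  return 0;
}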