From 0bfd0614150ddfaff34b27c4833b4b07ce680a94 Mon Sep 17 00:00:00 2001 From: Wish Date: Fri, 9 Sep 2022 10:57:11 +0800 Subject: [PATCH 01/17] move ingest out Signed-off-by: Wish --- .../Storages/DeltaMerge/DeltaMergeStore.cpp | 224 --------------- .../DeltaMerge/DeltaMergeStore_Ingest.cpp | 263 ++++++++++++++++++ 2 files changed, 263 insertions(+), 224 deletions(-) create mode 100644 dbms/src/Storages/DeltaMerge/DeltaMergeStore_Ingest.cpp diff --git a/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp b/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp index 4a2e452b59c..0f6b0b6461c 100644 --- a/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp +++ b/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp @@ -88,12 +88,9 @@ extern const char skip_check_segment_update[]; extern const char pause_before_dt_background_delta_merge[]; extern const char pause_until_dt_background_delta_merge[]; extern const char pause_when_writing_to_dt_store[]; -extern const char pause_when_ingesting_to_dt_store[]; extern const char pause_when_altering_dt_store[]; extern const char force_triggle_background_merge_delta[]; extern const char force_triggle_foreground_flush[]; -extern const char force_set_segment_ingest_packs_fail[]; -extern const char segment_merge_after_ingest_packs[]; extern const char random_exception_after_dt_write_done[]; extern const char force_slow_page_storage_snapshot_release[]; extern const char exception_before_drop_segment[]; @@ -744,227 +741,6 @@ void DeltaMergeStore::write(const Context & db_context, const DB::Settings & db_ checkSegmentUpdate(dm_context, segment, ThreadType::Write); } -std::tuple DeltaMergeStore::preAllocateIngestFile() -{ - if (shutdown_called.load(std::memory_order_relaxed)) - return {}; - - auto delegator = path_pool->getStableDiskDelegator(); - auto parent_path = delegator.choosePath(); - auto new_id = storage_pool->newDataPageIdForDTFile(delegator, __PRETTY_FUNCTION__); - return {parent_path, new_id}; -} - -void DeltaMergeStore::preIngestFile(const String & parent_path, const PageId file_id, size_t file_size) -{ - if (shutdown_called.load(std::memory_order_relaxed)) - return; - - auto delegator = path_pool->getStableDiskDelegator(); - delegator.addDTFile(file_id, file_size, parent_path); -} - -void DeltaMergeStore::ingestFiles( - const DMContextPtr & dm_context, - const RowKeyRange & range, - const PageIds & file_ids, - bool clear_data_in_range) -{ - if (unlikely(shutdown_called.load(std::memory_order_relaxed))) - { - const auto msg = fmt::format("Try to ingest files into a shutdown table, store={}", log->identifier()); - LOG_FMT_WARNING(log, "{}", msg); - throw Exception(msg); - } - - EventRecorder write_block_recorder(ProfileEvents::DMWriteFile, ProfileEvents::DMWriteFileNS); - - auto delegate = dm_context->path_pool.getStableDiskDelegator(); - auto file_provider = dm_context->db_context.getFileProvider(); - - size_t rows = 0; - size_t bytes = 0; - size_t bytes_on_disk = 0; - - DMFiles files; - for (auto file_id : file_ids) - { - auto file_parent_path = delegate.getDTFilePath(file_id); - - // we always create a ref file to this DMFile with all meta info restored later, so here we just restore meta info to calculate its' memory and disk size - auto file = DMFile::restore(file_provider, file_id, file_id, file_parent_path, DMFile::ReadMetaMode::memoryAndDiskSize()); - rows += file->getRows(); - bytes += file->getBytes(); - bytes_on_disk += file->getBytesOnDisk(); - - files.emplace_back(std::move(file)); - } - - { - auto get_ingest_files = [&] { - FmtBuffer fmt_buf; - fmt_buf.append("["); - fmt_buf.joinStr( - file_ids.begin(), - file_ids.end(), - [](const PageId id, FmtBuffer & fb) { fb.fmtAppend("dmf_{}", id); }, - ","); - fmt_buf.append("]"); - return fmt_buf.toString(); - }; - LOG_FMT_INFO( - log, - "Begin table ingest files, files={} rows={} bytes={} bytes_on_disk={} range={} clear={}", - get_ingest_files(), - rows, - bytes, - bytes_on_disk, - range.toDebugString(), - clear_data_in_range); - } - - Segments updated_segments; - RowKeyRange cur_range = range; - - // Put the ingest file ids into `storage_pool` and use ref id in each segments to ensure the atomic - // of ingesting. - // Check https://github.com/pingcap/tics/issues/2040 for more details. - // TODO: If tiflash crash during the middle of ingesting, we may leave some DTFiles on disk and - // they can not be deleted. We should find a way to cleanup those files. - WriteBatches ingest_wbs(*storage_pool, dm_context->getWriteLimiter()); - if (!files.empty()) - { - for (const auto & file : files) - { - ingest_wbs.data.putExternal(file->fileId(), 0); - } - ingest_wbs.writeLogAndData(); - ingest_wbs.setRollback(); // rollback if exception thrown - } - - while (!cur_range.none()) - { - RowKeyRange segment_range; - - // Keep trying until succeeded. - while (true) - { - SegmentPtr segment; - { - std::shared_lock lock(read_write_mutex); - - auto segment_it = segments.upper_bound(cur_range.getStart()); - if (segment_it == segments.end()) - { - throw Exception( - fmt::format("Failed to locate segment begin with start in range: {}", cur_range.toDebugString()), - ErrorCodes::LOGICAL_ERROR); - } - segment = segment_it->second; - } - - FAIL_POINT_PAUSE(FailPoints::pause_when_ingesting_to_dt_store); - waitForWrite(dm_context, segment); - if (segment->hasAbandoned()) - continue; - - segment_range = segment->getRowKeyRange(); - - // Write could fail, because other threads could already updated the instance. Like split/merge, merge delta. - ColumnFiles column_files; - WriteBatches wbs(*storage_pool, dm_context->getWriteLimiter()); - - for (const auto & file : files) - { - /// Generate DMFile instance with a new ref_id pointed to the file_id. - auto file_id = file->fileId(); - const auto & file_parent_path = file->parentPath(); - auto page_id = storage_pool->newDataPageIdForDTFile(delegate, __PRETTY_FUNCTION__); - - auto ref_file = DMFile::restore(file_provider, file_id, page_id, file_parent_path, DMFile::ReadMetaMode::all()); - auto column_file = std::make_shared(*dm_context, ref_file, segment_range); - if (column_file->getRows() != 0) - { - column_files.emplace_back(std::move(column_file)); - wbs.data.putRefPage(page_id, file->pageId()); - } - } - - // We have to commit those file_ids to PageStorage, because as soon as packs are written into segments, - // they are visible for readers who require file_ids to be found in PageStorage. - wbs.writeLogAndData(); - - bool ingest_success = segment->ingestColumnFiles(*dm_context, range.shrink(segment_range), column_files, clear_data_in_range); - fiu_do_on(FailPoints::force_set_segment_ingest_packs_fail, { ingest_success = false; }); - if (ingest_success) - { - updated_segments.push_back(segment); - fiu_do_on(FailPoints::segment_merge_after_ingest_packs, { - segment->flushCache(*dm_context); - segmentMergeDelta(*dm_context, segment, TaskRunThread::BackgroundThreadPool); - storage_pool->gc(global_context.getSettingsRef(), StoragePool::Seconds(0)); - }); - break; - } - else - { - wbs.rollbackWrittenLogAndData(); - } - } - - cur_range.setStart(segment_range.end); - cur_range.setEnd(range.end); - } - - // Enable gc for DTFile after all segment applied. - // Note that we can not enable gc for them once they have applied to any segments. - // Assume that one segment get compacted after file ingested, `gc_handle` gc the - // DTFiles before they get applied to all segments. Then we will apply some - // deleted DTFiles to other segments. - for (const auto & file : files) - file->enableGC(); - // After the ingest DTFiles applied, remove the original page - ingest_wbs.rollbackWrittenLogAndData(); - - { - // Add some logging about the ingested file ids and updated segments - // Example: "ingested_files=[dmf_1001,dmf_1002,dmf_1003] updated_segments=[,]" - // "ingested_files=[] updated_segments=[,]" - auto get_ingest_info = [&] { - FmtBuffer fmt_buf; - fmt_buf.append("ingested_files=["); - fmt_buf.joinStr( - file_ids.begin(), - file_ids.end(), - [](const PageId id, FmtBuffer & fb) { fb.fmtAppend("dmf_{}", id); }, - ","); - fmt_buf.append("] updated_segments=["); - fmt_buf.joinStr( - updated_segments.begin(), - updated_segments.end(), - [](const auto & segment, FmtBuffer & fb) { fb.fmtAppend("{}", segment->simpleInfo()); }, - ","); - fmt_buf.append("]"); - return fmt_buf.toString(); - }; - - LOG_FMT_INFO( - log, - "Finish table ingest files, ingested files into segments, {} clear={}", - get_ingest_info(), - clear_data_in_range); - } - - GET_METRIC(tiflash_storage_throughput_bytes, type_ingest).Increment(bytes); - GET_METRIC(tiflash_storage_throughput_rows, type_ingest).Increment(rows); - - flushCache(dm_context, range); - - // TODO: Update the tracing_id before checkSegmentUpdate? - for (auto & segment : updated_segments) - checkSegmentUpdate(dm_context, segment, ThreadType::Write); -} - void DeltaMergeStore::deleteRange(const Context & db_context, const DB::Settings & db_settings, const RowKeyRange & delete_range) { LOG_FMT_INFO(log, "Table delete range, range={}", delete_range.toDebugString()); diff --git a/dbms/src/Storages/DeltaMerge/DeltaMergeStore_Ingest.cpp b/dbms/src/Storages/DeltaMerge/DeltaMergeStore_Ingest.cpp new file mode 100644 index 00000000000..b76ec4c6d64 --- /dev/null +++ b/dbms/src/Storages/DeltaMerge/DeltaMergeStore_Ingest.cpp @@ -0,0 +1,263 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +namespace ProfileEvents +{ +extern const Event DMWriteFile; +extern const Event DMWriteFileNS; + +} // namespace ProfileEvents + +namespace DB +{ + +namespace FailPoints +{ +extern const char pause_when_ingesting_to_dt_store[]; +extern const char force_set_segment_ingest_packs_fail[]; +extern const char segment_merge_after_ingest_packs[]; +} // namespace FailPoints + +namespace DM +{ + +std::tuple DeltaMergeStore::preAllocateIngestFile() +{ + if (shutdown_called.load(std::memory_order_relaxed)) + return {}; + + auto delegator = path_pool->getStableDiskDelegator(); + auto parent_path = delegator.choosePath(); + auto new_id = storage_pool->newDataPageIdForDTFile(delegator, __PRETTY_FUNCTION__); + return {parent_path, new_id}; +} + +void DeltaMergeStore::preIngestFile(const String & parent_path, const PageId file_id, size_t file_size) +{ + if (shutdown_called.load(std::memory_order_relaxed)) + return; + + auto delegator = path_pool->getStableDiskDelegator(); + delegator.addDTFile(file_id, file_size, parent_path); +} + +void DeltaMergeStore::ingestFiles( + const DMContextPtr & dm_context, + const RowKeyRange & range, + const PageIds & file_ids, + bool clear_data_in_range) +{ + if (unlikely(shutdown_called.load(std::memory_order_relaxed))) + { + const auto msg = fmt::format("Try to ingest files into a shutdown table, store={}", log->identifier()); + LOG_FMT_WARNING(log, "{}", msg); + throw Exception(msg); + } + + EventRecorder write_block_recorder(ProfileEvents::DMWriteFile, ProfileEvents::DMWriteFileNS); + + auto delegate = dm_context->path_pool.getStableDiskDelegator(); + auto file_provider = dm_context->db_context.getFileProvider(); + + size_t rows = 0; + size_t bytes = 0; + size_t bytes_on_disk = 0; + + DMFiles files; + for (auto file_id : file_ids) + { + auto file_parent_path = delegate.getDTFilePath(file_id); + + // we always create a ref file to this DMFile with all meta info restored later, so here we just restore meta info to calculate its' memory and disk size + auto file = DMFile::restore(file_provider, file_id, file_id, file_parent_path, DMFile::ReadMetaMode::memoryAndDiskSize()); + rows += file->getRows(); + bytes += file->getBytes(); + bytes_on_disk += file->getBytesOnDisk(); + + files.emplace_back(std::move(file)); + } + + { + auto get_ingest_files = [&] { + FmtBuffer fmt_buf; + fmt_buf.append("["); + fmt_buf.joinStr( + file_ids.begin(), + file_ids.end(), + [](const PageId id, FmtBuffer & fb) { fb.fmtAppend("dmf_{}", id); }, + ","); + fmt_buf.append("]"); + return fmt_buf.toString(); + }; + LOG_FMT_INFO( + log, + "Begin table ingest files, files={} rows={} bytes={} bytes_on_disk={} range={} clear={}", + get_ingest_files(), + rows, + bytes, + bytes_on_disk, + range.toDebugString(), + clear_data_in_range); + } + + Segments updated_segments; + RowKeyRange cur_range = range; + + // Put the ingest file ids into `storage_pool` and use ref id in each segments to ensure the atomic + // of ingesting. + // Check https://github.com/pingcap/tics/issues/2040 for more details. + // TODO: If tiflash crash during the middle of ingesting, we may leave some DTFiles on disk and + // they can not be deleted. We should find a way to cleanup those files. + WriteBatches ingest_wbs(*storage_pool, dm_context->getWriteLimiter()); + if (!files.empty()) + { + for (const auto & file : files) + { + ingest_wbs.data.putExternal(file->fileId(), 0); + } + ingest_wbs.writeLogAndData(); + ingest_wbs.setRollback(); // rollback if exception thrown + } + + while (!cur_range.none()) + { + RowKeyRange segment_range; + + // Keep trying until succeeded. + while (true) + { + SegmentPtr segment; + { + std::shared_lock lock(read_write_mutex); + + auto segment_it = segments.upper_bound(cur_range.getStart()); + if (segment_it == segments.end()) + { + throw Exception( + fmt::format("Failed to locate segment begin with start in range: {}", cur_range.toDebugString()), + ErrorCodes::LOGICAL_ERROR); + } + segment = segment_it->second; + } + + FAIL_POINT_PAUSE(FailPoints::pause_when_ingesting_to_dt_store); + waitForWrite(dm_context, segment); + if (segment->hasAbandoned()) + continue; + + segment_range = segment->getRowKeyRange(); + + // Write could fail, because other threads could already updated the instance. Like split/merge, merge delta. + ColumnFiles column_files; + WriteBatches wbs(*storage_pool, dm_context->getWriteLimiter()); + + for (const auto & file : files) + { + /// Generate DMFile instance with a new ref_id pointed to the file_id. + auto file_id = file->fileId(); + const auto & file_parent_path = file->parentPath(); + auto page_id = storage_pool->newDataPageIdForDTFile(delegate, __PRETTY_FUNCTION__); + + auto ref_file = DMFile::restore(file_provider, file_id, page_id, file_parent_path, DMFile::ReadMetaMode::all()); + auto column_file = std::make_shared(*dm_context, ref_file, segment_range); + if (column_file->getRows() != 0) + { + column_files.emplace_back(std::move(column_file)); + wbs.data.putRefPage(page_id, file->pageId()); + } + } + + // We have to commit those file_ids to PageStorage, because as soon as packs are written into segments, + // they are visible for readers who require file_ids to be found in PageStorage. + wbs.writeLogAndData(); + + bool ingest_success = segment->ingestColumnFiles(*dm_context, range.shrink(segment_range), column_files, clear_data_in_range); + fiu_do_on(FailPoints::force_set_segment_ingest_packs_fail, { ingest_success = false; }); + if (ingest_success) + { + updated_segments.push_back(segment); + fiu_do_on(FailPoints::segment_merge_after_ingest_packs, { + segment->flushCache(*dm_context); + segmentMergeDelta(*dm_context, segment, TaskRunThread::BackgroundThreadPool); + storage_pool->gc(global_context.getSettingsRef(), StoragePool::Seconds(0)); + }); + break; + } + else + { + wbs.rollbackWrittenLogAndData(); + } + } + + cur_range.setStart(segment_range.end); + cur_range.setEnd(range.end); + } + + // Enable gc for DTFile after all segment applied. + // Note that we can not enable gc for them once they have applied to any segments. + // Assume that one segment get compacted after file ingested, `gc_handle` gc the + // DTFiles before they get applied to all segments. Then we will apply some + // deleted DTFiles to other segments. + for (const auto & file : files) + file->enableGC(); + // After the ingest DTFiles applied, remove the original page + ingest_wbs.rollbackWrittenLogAndData(); + + { + // Add some logging about the ingested file ids and updated segments + // Example: "ingested_files=[dmf_1001,dmf_1002,dmf_1003] updated_segments=[,]" + // "ingested_files=[] updated_segments=[,]" + auto get_ingest_info = [&] { + FmtBuffer fmt_buf; + fmt_buf.append("ingested_files=["); + fmt_buf.joinStr( + file_ids.begin(), + file_ids.end(), + [](const PageId id, FmtBuffer & fb) { fb.fmtAppend("dmf_{}", id); }, + ","); + fmt_buf.append("] updated_segments=["); + fmt_buf.joinStr( + updated_segments.begin(), + updated_segments.end(), + [](const auto & segment, FmtBuffer & fb) { fb.fmtAppend("{}", segment->simpleInfo()); }, + ","); + fmt_buf.append("]"); + return fmt_buf.toString(); + }; + + LOG_FMT_INFO( + log, + "Finish table ingest files, ingested files into segments, {} clear={}", + get_ingest_info(), + clear_data_in_range); + } + + GET_METRIC(tiflash_storage_throughput_bytes, type_ingest).Increment(bytes); + GET_METRIC(tiflash_storage_throughput_rows, type_ingest).Increment(rows); + + flushCache(dm_context, range); + + // TODO: Update the tracing_id before checkSegmentUpdate? + for (auto & segment : updated_segments) + checkSegmentUpdate(dm_context, segment, ThreadType::Write); +} + +} +} From 9d52cdc635d617fe4ee4f8a0e7801e8207230f14 Mon Sep 17 00:00:00 2001 From: Wish Date: Fri, 9 Sep 2022 11:12:17 +0800 Subject: [PATCH 02/17] move internal segment ops out Signed-off-by: Wish --- .../Storages/DeltaMerge/DeltaMergeStore.cpp | 434 ---------------- .../DeltaMergeStore_InternalSegment.cpp | 468 ++++++++++++++++++ 2 files changed, 468 insertions(+), 434 deletions(-) create mode 100644 dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalSegment.cpp diff --git a/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp b/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp index 0f6b0b6461c..658ef8d63bc 100644 --- a/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp +++ b/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp @@ -62,15 +62,10 @@ extern const Event DMAppendDeltaCleanUpNS; namespace CurrentMetrics { -extern const Metric DT_DeltaMerge; -extern const Metric DT_SegmentSplit; -extern const Metric DT_SegmentMerge; extern const Metric DT_DeltaMergeTotalBytes; extern const Metric DT_DeltaMergeTotalRows; extern const Metric DT_SnapshotOfRead; extern const Metric DT_SnapshotOfReadRaw; -extern const Metric DT_SnapshotOfSegmentSplit; -extern const Metric DT_SnapshotOfSegmentMerge; extern const Metric DT_SnapshotOfDeltaMerge; extern const Metric DT_SnapshotOfPlaceIndex; } // namespace CurrentMetrics @@ -1834,435 +1829,6 @@ UInt64 DeltaMergeStore::onSyncGc(Int64 limit) return gc_segments_num; } -SegmentPair DeltaMergeStore::segmentSplit(DMContext & dm_context, const SegmentPtr & segment, bool is_foreground) -{ - LOG_FMT_INFO( - log, - "Split - Begin, is_foreground={} safe_point={} segment={}", - is_foreground, - dm_context.min_version, - segment->info()); - - SegmentSnapshotPtr segment_snap; - ColumnDefinesPtr schema_snap; - - { - std::shared_lock lock(read_write_mutex); - - if (!isSegmentValid(lock, segment)) - { - LOG_FMT_DEBUG(log, "Split - Give up segmentSplit because not valid, segment={}", segment->simpleInfo()); - return {}; - } - - segment_snap = segment->createSnapshot(dm_context, /* for_update */ true, CurrentMetrics::DT_SnapshotOfSegmentSplit); - if (!segment_snap || !segment_snap->getRows()) - { - LOG_FMT_DEBUG(log, "Split - Give up segmentSplit because snapshot failed or no row, segment={}", segment->simpleInfo()); - return {}; - } - schema_snap = store_columns; - } - - // Not counting the early give up action. - auto delta_bytes = static_cast(segment_snap->delta->getBytes()); - auto delta_rows = static_cast(segment_snap->delta->getRows()); - - size_t duplicated_bytes = 0; - size_t duplicated_rows = 0; - - CurrentMetrics::Increment cur_dm_segments{CurrentMetrics::DT_SegmentSplit}; - if (is_foreground) - GET_METRIC(tiflash_storage_subtask_count, type_seg_split_fg).Increment(); - else - GET_METRIC(tiflash_storage_subtask_count, type_seg_split).Increment(); - Stopwatch watch_seg_split; - SCOPE_EXIT({ - if (is_foreground) - GET_METRIC(tiflash_storage_subtask_duration_seconds, type_seg_split_fg).Observe(watch_seg_split.elapsedSeconds()); - else - GET_METRIC(tiflash_storage_subtask_duration_seconds, type_seg_split).Observe(watch_seg_split.elapsedSeconds()); - }); - - WriteBatches wbs(*storage_pool, dm_context.getWriteLimiter()); - - auto range = segment->getRowKeyRange(); - auto split_info_opt = segment->prepareSplit(dm_context, schema_snap, segment_snap, wbs); - - if (!split_info_opt.has_value()) - { - // Likely we can not find an appropriate split point for this segment later, forbid the split until this segment get updated through applying delta-merge. Or it will slow down the write a lot. - segment->forbidSplit(); - LOG_FMT_WARNING(log, "Split - Give up segmentSplit and forbid later split because of prepare split failed, segment={}", segment->simpleInfo()); - return {}; - } - - auto & split_info = split_info_opt.value(); - - wbs.writeLogAndData(); - split_info.my_stable->enableDMFilesGC(); - split_info.other_stable->enableDMFilesGC(); - - SegmentPtr new_left, new_right; - { - std::unique_lock lock(read_write_mutex); - - if (!isSegmentValid(lock, segment)) - { - LOG_FMT_DEBUG(log, "Split - Give up segmentSplit because not valid, segment={}", segment->simpleInfo()); - wbs.setRollback(); - return {}; - } - - auto segment_lock = segment->mustGetUpdateLock(); - - std::tie(new_left, new_right) = segment->applySplit(dm_context, segment_snap, wbs, split_info); - - wbs.writeMeta(); - - segment->abandon(dm_context); - segments.erase(range.getEnd()); - id_to_segment.erase(segment->segmentId()); - - segments[new_left->getRowKeyRange().getEnd()] = new_left; - segments[new_right->getRowKeyRange().getEnd()] = new_right; - - id_to_segment.emplace(new_left->segmentId(), new_left); - id_to_segment.emplace(new_right->segmentId(), new_right); - - if constexpr (DM_RUN_CHECK) - { - new_left->check(dm_context, "After split left"); - new_right->check(dm_context, "After split right"); - } - - duplicated_bytes = new_left->getDelta()->getBytes(); - duplicated_rows = new_right->getDelta()->getBytes(); - - LOG_FMT_INFO(log, "Split - {} - Finish, segment is split into two, old_segment={} new_left={} new_right={}", split_info.is_logical ? "SplitLogical" : "SplitPhysical", segment->info(), new_left->info(), new_right->info()); - } - - wbs.writeRemoves(); - - if (!split_info.is_logical) - { - GET_METRIC(tiflash_storage_throughput_bytes, type_split).Increment(delta_bytes); - GET_METRIC(tiflash_storage_throughput_rows, type_split).Increment(delta_rows); - } - else - { - // For logical split, delta is duplicated into two segments. And will be merged into stable twice later. So we need to decrease it here. - // Otherwise the final total delta merge bytes is greater than bytes written into. - GET_METRIC(tiflash_storage_throughput_bytes, type_split).Decrement(duplicated_bytes); - GET_METRIC(tiflash_storage_throughput_rows, type_split).Decrement(duplicated_rows); - } - - if constexpr (DM_RUN_CHECK) - check(dm_context.db_context); - - return {new_left, new_right}; -} - -void DeltaMergeStore::segmentMerge(DMContext & dm_context, const SegmentPtr & left, const SegmentPtr & right, bool is_foreground) -{ - LOG_FMT_INFO( - log, - "Merge - Begin, is_foreground={} safe_point={} left={} right={}", - is_foreground, - dm_context.min_version, - left->info(), - right->info()); - - /// This segment may contain some rows that not belong to this segment range which is left by previous split operation. - /// And only saved data in this segment will be filtered by the segment range in the merge process, - /// unsaved data will be directly copied to the new segment. - /// So we flush here to make sure that all potential data left by previous split operation is saved. - while (!left->flushCache(dm_context)) - { - // keep flush until success if not abandoned - if (left->hasAbandoned()) - { - LOG_FMT_INFO(log, "Merge - Give up segmentMerge because left abandoned, left={} right={}", left->simpleInfo(), right->simpleInfo()); - return; - } - } - while (!right->flushCache(dm_context)) - { - // keep flush until success if not abandoned - if (right->hasAbandoned()) - { - LOG_FMT_DEBUG(log, "Merge - Give up segmentMerge because right abandoned, left={} right={}", left->simpleInfo(), right->simpleInfo()); - return; - } - } - - SegmentSnapshotPtr left_snap; - SegmentSnapshotPtr right_snap; - ColumnDefinesPtr schema_snap; - - { - std::shared_lock lock(read_write_mutex); - - if (!isSegmentValid(lock, left)) - { - LOG_FMT_DEBUG(log, "Merge - Give up segmentMerge because left not valid, left={} right={}", left->simpleInfo(), right->simpleInfo()); - return; - } - if (!isSegmentValid(lock, right)) - { - LOG_FMT_DEBUG(log, "Merge - Give up segmentMerge because right not valid, left={} right={}", left->simpleInfo(), right->simpleInfo()); - return; - } - - left_snap = left->createSnapshot(dm_context, /* for_update */ true, CurrentMetrics::DT_SnapshotOfSegmentMerge); - right_snap = right->createSnapshot(dm_context, /* for_update */ true, CurrentMetrics::DT_SnapshotOfSegmentMerge); - - if (!left_snap || !right_snap) - { - LOG_FMT_DEBUG(log, "Merge - Give up segmentMerge because snapshot failed, left={} right={}", left->simpleInfo(), right->simpleInfo()); - return; - } - schema_snap = store_columns; - } - - // Not counting the early give up action. - auto delta_bytes = static_cast(left_snap->delta->getBytes()) + right_snap->delta->getBytes(); - auto delta_rows = static_cast(left_snap->delta->getRows()) + right_snap->delta->getRows(); - - CurrentMetrics::Increment cur_dm_segments{CurrentMetrics::DT_SegmentMerge}; - if (is_foreground) - GET_METRIC(tiflash_storage_subtask_count, type_seg_merge_fg).Increment(); - else - GET_METRIC(tiflash_storage_subtask_count, type_seg_merge).Increment(); - Stopwatch watch_seg_merge; - SCOPE_EXIT({ - if (is_foreground) - GET_METRIC(tiflash_storage_subtask_duration_seconds, type_seg_merge_fg).Observe(watch_seg_merge.elapsedSeconds()); - else - GET_METRIC(tiflash_storage_subtask_duration_seconds, type_seg_merge).Observe(watch_seg_merge.elapsedSeconds()); - }); - - auto left_range = left->getRowKeyRange(); - auto right_range = right->getRowKeyRange(); - - WriteBatches wbs(*storage_pool, dm_context.getWriteLimiter()); - auto merged_stable = Segment::prepareMerge(dm_context, schema_snap, left, left_snap, right, right_snap, wbs); - wbs.writeLogAndData(); - merged_stable->enableDMFilesGC(); - - { - std::unique_lock lock(read_write_mutex); - - if (!isSegmentValid(lock, left) || !isSegmentValid(lock, right)) - { - LOG_FMT_DEBUG(log, "Merge - Give up segmentMerge because left or right not valid, left={} right={}", left->simpleInfo(), right->simpleInfo()); - wbs.setRollback(); - return; - } - - auto left_lock = left->mustGetUpdateLock(); - auto right_lock = right->mustGetUpdateLock(); - - auto merged = Segment::applyMerge(dm_context, left, left_snap, right, right_snap, wbs, merged_stable); - - wbs.writeMeta(); - - left->abandon(dm_context); - right->abandon(dm_context); - segments.erase(left_range.getEnd()); - segments.erase(right_range.getEnd()); - id_to_segment.erase(left->segmentId()); - id_to_segment.erase(right->segmentId()); - - segments.emplace(merged->getRowKeyRange().getEnd(), merged); - id_to_segment.emplace(merged->segmentId(), merged); - - if constexpr (DM_RUN_CHECK) - merged->check(dm_context, "After segment merge"); - - LOG_FMT_INFO( - log, - "Merge - Finish, two segments are merged into one, is_foreground={} left={} right={} merged={}", - is_foreground, - dm_context.min_version, - left->info(), - right->info(), - merged->info()); - } - - wbs.writeRemoves(); - - GET_METRIC(tiflash_storage_throughput_bytes, type_merge).Increment(delta_bytes); - GET_METRIC(tiflash_storage_throughput_rows, type_merge).Increment(delta_rows); - - if constexpr (DM_RUN_CHECK) - check(dm_context.db_context); -} - -SegmentPtr DeltaMergeStore::segmentMergeDelta( - DMContext & dm_context, - const SegmentPtr & segment, - const TaskRunThread run_thread, - SegmentSnapshotPtr segment_snap) -{ - LOG_FMT_INFO(log, "MergeDelta - Begin, thread={} safe_point={} segment={}", toString(run_thread), dm_context.min_version, segment->info()); - - ColumnDefinesPtr schema_snap; - - { - std::shared_lock lock(read_write_mutex); - - if (!isSegmentValid(lock, segment)) - { - LOG_FMT_DEBUG(log, "MergeDelta - Give up segmentMergeDelta because segment not valid, segment={}", segment->simpleInfo()); - return {}; - } - - // Try to generate a new snapshot if there is no pre-allocated one - if (!segment_snap) - segment_snap = segment->createSnapshot(dm_context, /* for_update */ true, CurrentMetrics::DT_SnapshotOfDeltaMerge); - - if (unlikely(!segment_snap)) - { - LOG_FMT_DEBUG(log, "MergeDelta - Give up segmentMergeDelta because snapshot failed, segment={}", segment->simpleInfo()); - return {}; - } - schema_snap = store_columns; - } - - // Not counting the early give up action. - auto delta_bytes = static_cast(segment_snap->delta->getBytes()); - auto delta_rows = static_cast(segment_snap->delta->getRows()); - - CurrentMetrics::Increment cur_dm_segments{CurrentMetrics::DT_DeltaMerge}; - CurrentMetrics::Increment cur_dm_total_bytes{CurrentMetrics::DT_DeltaMergeTotalBytes, static_cast(segment_snap->getBytes())}; - CurrentMetrics::Increment cur_dm_total_rows{CurrentMetrics::DT_DeltaMergeTotalRows, static_cast(segment_snap->getRows())}; - - switch (run_thread) - { - case TaskRunThread::BackgroundThreadPool: - GET_METRIC(tiflash_storage_subtask_count, type_delta_merge).Increment(); - break; - case TaskRunThread::Foreground: - GET_METRIC(tiflash_storage_subtask_count, type_delta_merge_fg).Increment(); - break; - case TaskRunThread::ForegroundRPC: - GET_METRIC(tiflash_storage_subtask_count, type_delta_merge_fg_rpc).Increment(); - break; - case TaskRunThread::BackgroundGCThread: - GET_METRIC(tiflash_storage_subtask_count, type_delta_merge_bg_gc).Increment(); - break; - default: - break; - } - - Stopwatch watch_delta_merge; - SCOPE_EXIT({ - switch (run_thread) - { - case TaskRunThread::BackgroundThreadPool: - GET_METRIC(tiflash_storage_subtask_duration_seconds, type_delta_merge).Observe(watch_delta_merge.elapsedSeconds()); - break; - case TaskRunThread::Foreground: - GET_METRIC(tiflash_storage_subtask_duration_seconds, type_delta_merge_fg).Observe(watch_delta_merge.elapsedSeconds()); - break; - case TaskRunThread::ForegroundRPC: - GET_METRIC(tiflash_storage_subtask_duration_seconds, type_delta_merge_fg_rpc).Observe(watch_delta_merge.elapsedSeconds()); - break; - case TaskRunThread::BackgroundGCThread: - GET_METRIC(tiflash_storage_subtask_duration_seconds, type_delta_merge_bg_gc).Observe(watch_delta_merge.elapsedSeconds()); - break; - default: - break; - } - }); - - WriteBatches wbs(*storage_pool, dm_context.getWriteLimiter()); - - auto new_stable = segment->prepareMergeDelta(dm_context, schema_snap, segment_snap, wbs); - wbs.writeLogAndData(); - new_stable->enableDMFilesGC(); - - SegmentPtr new_segment; - { - std::unique_lock read_write_lock(read_write_mutex); - - if (!isSegmentValid(read_write_lock, segment)) - { - LOG_FMT_DEBUG(log, "MergeDelta - Give up segmentMergeDelta because segment not valid, segment={}", segment->simpleInfo()); - wbs.setRollback(); - return {}; - } - - auto segment_lock = segment->mustGetUpdateLock(); - - new_segment = segment->applyMergeDelta(dm_context, segment_snap, wbs, new_stable); - - wbs.writeMeta(); - - - // The instance of PKRange::End is closely linked to instance of PKRange. So we cannot reuse it. - // Replace must be done by erase + insert. - segments.erase(segment->getRowKeyRange().getEnd()); - id_to_segment.erase(segment->segmentId()); - - segments[new_segment->getRowKeyRange().getEnd()] = new_segment; - id_to_segment[new_segment->segmentId()] = new_segment; - - segment->abandon(dm_context); - - if constexpr (DM_RUN_CHECK) - { - new_segment->check(dm_context, "After segmentMergeDelta"); - } - - LOG_FMT_INFO(log, "MergeDelta - Finish, delta is merged, old_segment={} new_segment={}", segment->info(), new_segment->info()); - } - - wbs.writeRemoves(); - - GET_METRIC(tiflash_storage_throughput_bytes, type_delta_merge).Increment(delta_bytes); - GET_METRIC(tiflash_storage_throughput_rows, type_delta_merge).Increment(delta_rows); - - if constexpr (DM_RUN_CHECK) - check(dm_context.db_context); - - return new_segment; -} - -bool DeltaMergeStore::doIsSegmentValid(const SegmentPtr & segment) -{ - if (segment->hasAbandoned()) - { - LOG_FMT_DEBUG(log, "Segment instance is abandoned, segment={}", segment->simpleInfo()); - return false; - } - // Segment instance could have been removed or replaced. - auto it = segments.find(segment->getRowKeyRange().getEnd()); - if (it == segments.end()) - { - LOG_FMT_DEBUG(log, "Segment not found in segment map, segment={}", segment->simpleInfo()); - - auto it2 = id_to_segment.find(segment->segmentId()); - if (it2 != id_to_segment.end()) - { - LOG_FMT_DEBUG( - log, - "Found segment with same id in id_to_segment, found_segment={} my_segment={}", - it2->second->info(), - segment->info()); - } - return false; - } - auto & cur_segment = it->second; - if (cur_segment.get() != segment.get()) - { - LOG_FMT_DEBUG(log, "Segment instance has been replaced in segment map, segment={}", segment->simpleInfo()); - return false; - } - return true; -} - void DeltaMergeStore::check(const Context & /*db_context*/) { std::shared_lock lock(read_write_mutex); diff --git a/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalSegment.cpp b/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalSegment.cpp new file mode 100644 index 00000000000..622124bceb4 --- /dev/null +++ b/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalSegment.cpp @@ -0,0 +1,468 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +namespace CurrentMetrics +{ +extern const Metric DT_DeltaMerge; +extern const Metric DT_DeltaMergeTotalBytes; +extern const Metric DT_DeltaMergeTotalRows; +extern const Metric DT_SegmentSplit; +extern const Metric DT_SegmentMerge; +extern const Metric DT_SnapshotOfSegmentSplit; +extern const Metric DT_SnapshotOfSegmentMerge; +extern const Metric DT_SnapshotOfDeltaMerge; +} // namespace CurrentMetrics + +namespace DB +{ + +namespace DM +{ + +SegmentPair DeltaMergeStore::segmentSplit(DMContext & dm_context, const SegmentPtr & segment, bool is_foreground) +{ + LOG_FMT_INFO( + log, + "Split - Begin, is_foreground={} safe_point={} segment={}", + is_foreground, + dm_context.min_version, + segment->info()); + + SegmentSnapshotPtr segment_snap; + ColumnDefinesPtr schema_snap; + + { + std::shared_lock lock(read_write_mutex); + + if (!isSegmentValid(lock, segment)) + { + LOG_FMT_DEBUG(log, "Split - Give up segmentSplit because not valid, segment={}", segment->simpleInfo()); + return {}; + } + + segment_snap = segment->createSnapshot(dm_context, /* for_update */ true, CurrentMetrics::DT_SnapshotOfSegmentSplit); + if (!segment_snap || !segment_snap->getRows()) + { + LOG_FMT_DEBUG(log, "Split - Give up segmentSplit because snapshot failed or no row, segment={}", segment->simpleInfo()); + return {}; + } + schema_snap = store_columns; + } + + // Not counting the early give up action. + auto delta_bytes = static_cast(segment_snap->delta->getBytes()); + auto delta_rows = static_cast(segment_snap->delta->getRows()); + + size_t duplicated_bytes = 0; + size_t duplicated_rows = 0; + + CurrentMetrics::Increment cur_dm_segments{CurrentMetrics::DT_SegmentSplit}; + if (is_foreground) + GET_METRIC(tiflash_storage_subtask_count, type_seg_split_fg).Increment(); + else + GET_METRIC(tiflash_storage_subtask_count, type_seg_split).Increment(); + Stopwatch watch_seg_split; + SCOPE_EXIT({ + if (is_foreground) + GET_METRIC(tiflash_storage_subtask_duration_seconds, type_seg_split_fg).Observe(watch_seg_split.elapsedSeconds()); + else + GET_METRIC(tiflash_storage_subtask_duration_seconds, type_seg_split).Observe(watch_seg_split.elapsedSeconds()); + }); + + WriteBatches wbs(*storage_pool, dm_context.getWriteLimiter()); + + auto range = segment->getRowKeyRange(); + auto split_info_opt = segment->prepareSplit(dm_context, schema_snap, segment_snap, wbs); + + if (!split_info_opt.has_value()) + { + // Likely we can not find an appropriate split point for this segment later, forbid the split until this segment get updated through applying delta-merge. Or it will slow down the write a lot. + segment->forbidSplit(); + LOG_FMT_WARNING(log, "Split - Give up segmentSplit and forbid later split because of prepare split failed, segment={}", segment->simpleInfo()); + return {}; + } + + auto & split_info = split_info_opt.value(); + + wbs.writeLogAndData(); + split_info.my_stable->enableDMFilesGC(); + split_info.other_stable->enableDMFilesGC(); + + SegmentPtr new_left, new_right; + { + std::unique_lock lock(read_write_mutex); + + if (!isSegmentValid(lock, segment)) + { + LOG_FMT_DEBUG(log, "Split - Give up segmentSplit because not valid, segment={}", segment->simpleInfo()); + wbs.setRollback(); + return {}; + } + + auto segment_lock = segment->mustGetUpdateLock(); + + std::tie(new_left, new_right) = segment->applySplit(dm_context, segment_snap, wbs, split_info); + + wbs.writeMeta(); + + segment->abandon(dm_context); + segments.erase(range.getEnd()); + id_to_segment.erase(segment->segmentId()); + + segments[new_left->getRowKeyRange().getEnd()] = new_left; + segments[new_right->getRowKeyRange().getEnd()] = new_right; + + id_to_segment.emplace(new_left->segmentId(), new_left); + id_to_segment.emplace(new_right->segmentId(), new_right); + + if constexpr (DM_RUN_CHECK) + { + new_left->check(dm_context, "After split left"); + new_right->check(dm_context, "After split right"); + } + + duplicated_bytes = new_left->getDelta()->getBytes(); + duplicated_rows = new_right->getDelta()->getBytes(); + + LOG_FMT_INFO(log, "Split - {} - Finish, segment is split into two, old_segment={} new_left={} new_right={}", split_info.is_logical ? "SplitLogical" : "SplitPhysical", segment->info(), new_left->info(), new_right->info()); + } + + wbs.writeRemoves(); + + if (!split_info.is_logical) + { + GET_METRIC(tiflash_storage_throughput_bytes, type_split).Increment(delta_bytes); + GET_METRIC(tiflash_storage_throughput_rows, type_split).Increment(delta_rows); + } + else + { + // For logical split, delta is duplicated into two segments. And will be merged into stable twice later. So we need to decrease it here. + // Otherwise the final total delta merge bytes is greater than bytes written into. + GET_METRIC(tiflash_storage_throughput_bytes, type_split).Decrement(duplicated_bytes); + GET_METRIC(tiflash_storage_throughput_rows, type_split).Decrement(duplicated_rows); + } + + if constexpr (DM_RUN_CHECK) + check(dm_context.db_context); + + return {new_left, new_right}; +} + +void DeltaMergeStore::segmentMerge(DMContext & dm_context, const SegmentPtr & left, const SegmentPtr & right, bool is_foreground) +{ + LOG_FMT_INFO( + log, + "Merge - Begin, is_foreground={} safe_point={} left={} right={}", + is_foreground, + dm_context.min_version, + left->info(), + right->info()); + + /// This segment may contain some rows that not belong to this segment range which is left by previous split operation. + /// And only saved data in this segment will be filtered by the segment range in the merge process, + /// unsaved data will be directly copied to the new segment. + /// So we flush here to make sure that all potential data left by previous split operation is saved. + while (!left->flushCache(dm_context)) + { + // keep flush until success if not abandoned + if (left->hasAbandoned()) + { + LOG_FMT_INFO(log, "Merge - Give up segmentMerge because left abandoned, left={} right={}", left->simpleInfo(), right->simpleInfo()); + return; + } + } + while (!right->flushCache(dm_context)) + { + // keep flush until success if not abandoned + if (right->hasAbandoned()) + { + LOG_FMT_DEBUG(log, "Merge - Give up segmentMerge because right abandoned, left={} right={}", left->simpleInfo(), right->simpleInfo()); + return; + } + } + + SegmentSnapshotPtr left_snap; + SegmentSnapshotPtr right_snap; + ColumnDefinesPtr schema_snap; + + { + std::shared_lock lock(read_write_mutex); + + if (!isSegmentValid(lock, left)) + { + LOG_FMT_DEBUG(log, "Merge - Give up segmentMerge because left not valid, left={} right={}", left->simpleInfo(), right->simpleInfo()); + return; + } + if (!isSegmentValid(lock, right)) + { + LOG_FMT_DEBUG(log, "Merge - Give up segmentMerge because right not valid, left={} right={}", left->simpleInfo(), right->simpleInfo()); + return; + } + + left_snap = left->createSnapshot(dm_context, /* for_update */ true, CurrentMetrics::DT_SnapshotOfSegmentMerge); + right_snap = right->createSnapshot(dm_context, /* for_update */ true, CurrentMetrics::DT_SnapshotOfSegmentMerge); + + if (!left_snap || !right_snap) + { + LOG_FMT_DEBUG(log, "Merge - Give up segmentMerge because snapshot failed, left={} right={}", left->simpleInfo(), right->simpleInfo()); + return; + } + schema_snap = store_columns; + } + + // Not counting the early give up action. + auto delta_bytes = static_cast(left_snap->delta->getBytes()) + right_snap->delta->getBytes(); + auto delta_rows = static_cast(left_snap->delta->getRows()) + right_snap->delta->getRows(); + + CurrentMetrics::Increment cur_dm_segments{CurrentMetrics::DT_SegmentMerge}; + if (is_foreground) + GET_METRIC(tiflash_storage_subtask_count, type_seg_merge_fg).Increment(); + else + GET_METRIC(tiflash_storage_subtask_count, type_seg_merge).Increment(); + Stopwatch watch_seg_merge; + SCOPE_EXIT({ + if (is_foreground) + GET_METRIC(tiflash_storage_subtask_duration_seconds, type_seg_merge_fg).Observe(watch_seg_merge.elapsedSeconds()); + else + GET_METRIC(tiflash_storage_subtask_duration_seconds, type_seg_merge).Observe(watch_seg_merge.elapsedSeconds()); + }); + + auto left_range = left->getRowKeyRange(); + auto right_range = right->getRowKeyRange(); + + WriteBatches wbs(*storage_pool, dm_context.getWriteLimiter()); + auto merged_stable = Segment::prepareMerge(dm_context, schema_snap, left, left_snap, right, right_snap, wbs); + wbs.writeLogAndData(); + merged_stable->enableDMFilesGC(); + + { + std::unique_lock lock(read_write_mutex); + + if (!isSegmentValid(lock, left) || !isSegmentValid(lock, right)) + { + LOG_FMT_DEBUG(log, "Merge - Give up segmentMerge because left or right not valid, left={} right={}", left->simpleInfo(), right->simpleInfo()); + wbs.setRollback(); + return; + } + + auto left_lock = left->mustGetUpdateLock(); + auto right_lock = right->mustGetUpdateLock(); + + auto merged = Segment::applyMerge(dm_context, left, left_snap, right, right_snap, wbs, merged_stable); + + wbs.writeMeta(); + + left->abandon(dm_context); + right->abandon(dm_context); + segments.erase(left_range.getEnd()); + segments.erase(right_range.getEnd()); + id_to_segment.erase(left->segmentId()); + id_to_segment.erase(right->segmentId()); + + segments.emplace(merged->getRowKeyRange().getEnd(), merged); + id_to_segment.emplace(merged->segmentId(), merged); + + if constexpr (DM_RUN_CHECK) + merged->check(dm_context, "After segment merge"); + + LOG_FMT_INFO( + log, + "Merge - Finish, two segments are merged into one, is_foreground={} left={} right={} merged={}", + is_foreground, + dm_context.min_version, + left->info(), + right->info(), + merged->info()); + } + + wbs.writeRemoves(); + + GET_METRIC(tiflash_storage_throughput_bytes, type_merge).Increment(delta_bytes); + GET_METRIC(tiflash_storage_throughput_rows, type_merge).Increment(delta_rows); + + if constexpr (DM_RUN_CHECK) + check(dm_context.db_context); +} + +SegmentPtr DeltaMergeStore::segmentMergeDelta( + DMContext & dm_context, + const SegmentPtr & segment, + const TaskRunThread run_thread, + SegmentSnapshotPtr segment_snap) +{ + LOG_FMT_INFO(log, "MergeDelta - Begin, thread={} safe_point={} segment={}", toString(run_thread), dm_context.min_version, segment->info()); + + ColumnDefinesPtr schema_snap; + + { + std::shared_lock lock(read_write_mutex); + + if (!isSegmentValid(lock, segment)) + { + LOG_FMT_DEBUG(log, "MergeDelta - Give up segmentMergeDelta because segment not valid, segment={}", segment->simpleInfo()); + return {}; + } + + // Try to generate a new snapshot if there is no pre-allocated one + if (!segment_snap) + segment_snap = segment->createSnapshot(dm_context, /* for_update */ true, CurrentMetrics::DT_SnapshotOfDeltaMerge); + + if (unlikely(!segment_snap)) + { + LOG_FMT_DEBUG(log, "MergeDelta - Give up segmentMergeDelta because snapshot failed, segment={}", segment->simpleInfo()); + return {}; + } + schema_snap = store_columns; + } + + // Not counting the early give up action. + auto delta_bytes = static_cast(segment_snap->delta->getBytes()); + auto delta_rows = static_cast(segment_snap->delta->getRows()); + + CurrentMetrics::Increment cur_dm_segments{CurrentMetrics::DT_DeltaMerge}; + CurrentMetrics::Increment cur_dm_total_bytes{CurrentMetrics::DT_DeltaMergeTotalBytes, static_cast(segment_snap->getBytes())}; + CurrentMetrics::Increment cur_dm_total_rows{CurrentMetrics::DT_DeltaMergeTotalRows, static_cast(segment_snap->getRows())}; + + switch (run_thread) + { + case TaskRunThread::BackgroundThreadPool: + GET_METRIC(tiflash_storage_subtask_count, type_delta_merge).Increment(); + break; + case TaskRunThread::Foreground: + GET_METRIC(tiflash_storage_subtask_count, type_delta_merge_fg).Increment(); + break; + case TaskRunThread::ForegroundRPC: + GET_METRIC(tiflash_storage_subtask_count, type_delta_merge_fg_rpc).Increment(); + break; + case TaskRunThread::BackgroundGCThread: + GET_METRIC(tiflash_storage_subtask_count, type_delta_merge_bg_gc).Increment(); + break; + default: + break; + } + + Stopwatch watch_delta_merge; + SCOPE_EXIT({ + switch (run_thread) + { + case TaskRunThread::BackgroundThreadPool: + GET_METRIC(tiflash_storage_subtask_duration_seconds, type_delta_merge).Observe(watch_delta_merge.elapsedSeconds()); + break; + case TaskRunThread::Foreground: + GET_METRIC(tiflash_storage_subtask_duration_seconds, type_delta_merge_fg).Observe(watch_delta_merge.elapsedSeconds()); + break; + case TaskRunThread::ForegroundRPC: + GET_METRIC(tiflash_storage_subtask_duration_seconds, type_delta_merge_fg_rpc).Observe(watch_delta_merge.elapsedSeconds()); + break; + case TaskRunThread::BackgroundGCThread: + GET_METRIC(tiflash_storage_subtask_duration_seconds, type_delta_merge_bg_gc).Observe(watch_delta_merge.elapsedSeconds()); + break; + default: + break; + } + }); + + WriteBatches wbs(*storage_pool, dm_context.getWriteLimiter()); + + auto new_stable = segment->prepareMergeDelta(dm_context, schema_snap, segment_snap, wbs); + wbs.writeLogAndData(); + new_stable->enableDMFilesGC(); + + SegmentPtr new_segment; + { + std::unique_lock read_write_lock(read_write_mutex); + + if (!isSegmentValid(read_write_lock, segment)) + { + LOG_FMT_DEBUG(log, "MergeDelta - Give up segmentMergeDelta because segment not valid, segment={}", segment->simpleInfo()); + wbs.setRollback(); + return {}; + } + + auto segment_lock = segment->mustGetUpdateLock(); + + new_segment = segment->applyMergeDelta(dm_context, segment_snap, wbs, new_stable); + + wbs.writeMeta(); + + + // The instance of PKRange::End is closely linked to instance of PKRange. So we cannot reuse it. + // Replace must be done by erase + insert. + segments.erase(segment->getRowKeyRange().getEnd()); + id_to_segment.erase(segment->segmentId()); + + segments[new_segment->getRowKeyRange().getEnd()] = new_segment; + id_to_segment[new_segment->segmentId()] = new_segment; + + segment->abandon(dm_context); + + if constexpr (DM_RUN_CHECK) + { + new_segment->check(dm_context, "After segmentMergeDelta"); + } + + LOG_FMT_INFO(log, "MergeDelta - Finish, delta is merged, old_segment={} new_segment={}", segment->info(), new_segment->info()); + } + + wbs.writeRemoves(); + + GET_METRIC(tiflash_storage_throughput_bytes, type_delta_merge).Increment(delta_bytes); + GET_METRIC(tiflash_storage_throughput_rows, type_delta_merge).Increment(delta_rows); + + if constexpr (DM_RUN_CHECK) + check(dm_context.db_context); + + return new_segment; +} + +bool DeltaMergeStore::doIsSegmentValid(const SegmentPtr & segment) +{ + if (segment->hasAbandoned()) + { + LOG_FMT_DEBUG(log, "Segment instance is abandoned, segment={}", segment->simpleInfo()); + return false; + } + // Segment instance could have been removed or replaced. + auto it = segments.find(segment->getRowKeyRange().getEnd()); + if (it == segments.end()) + { + LOG_FMT_DEBUG(log, "Segment not found in segment map, segment={}", segment->simpleInfo()); + + auto it2 = id_to_segment.find(segment->segmentId()); + if (it2 != id_to_segment.end()) + { + LOG_FMT_DEBUG( + log, + "Found segment with same id in id_to_segment, found_segment={} my_segment={}", + it2->second->info(), + segment->info()); + } + return false; + } + auto & cur_segment = it->second; + if (cur_segment.get() != segment.get()) + { + LOG_FMT_DEBUG(log, "Segment instance has been replaced in segment map, segment={}", segment->simpleInfo()); + return false; + } + return true; +} + +} + +} From 72b9c2835696fdbd41dd0bd56ca631be4a9d985f Mon Sep 17 00:00:00 2001 From: Wish Date: Fri, 9 Sep 2022 11:13:38 +0800 Subject: [PATCH 03/17] Re-format Signed-off-by: Wish --- dbms/src/Storages/DeltaMerge/DeltaMergeStore_Ingest.cpp | 6 +++--- .../Storages/DeltaMerge/DeltaMergeStore_InternalSegment.cpp | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dbms/src/Storages/DeltaMerge/DeltaMergeStore_Ingest.cpp b/dbms/src/Storages/DeltaMerge/DeltaMergeStore_Ingest.cpp index b76ec4c6d64..b2b04a747eb 100644 --- a/dbms/src/Storages/DeltaMerge/DeltaMergeStore_Ingest.cpp +++ b/dbms/src/Storages/DeltaMerge/DeltaMergeStore_Ingest.cpp @@ -15,8 +15,8 @@ #include #include #include -#include #include +#include namespace ProfileEvents { @@ -259,5 +259,5 @@ void DeltaMergeStore::ingestFiles( checkSegmentUpdate(dm_context, segment, ThreadType::Write); } -} -} +} // namespace DM +} // namespace DB diff --git a/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalSegment.cpp b/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalSegment.cpp index 622124bceb4..71a26aebb9e 100644 --- a/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalSegment.cpp +++ b/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalSegment.cpp @@ -463,6 +463,6 @@ bool DeltaMergeStore::doIsSegmentValid(const SegmentPtr & segment) return true; } -} +} // namespace DM -} +} // namespace DB From 7de2a21b92ab93ef0cf1f6e229ab04e165919025 Mon Sep 17 00:00:00 2001 From: Wish Date: Fri, 9 Sep 2022 12:38:39 +0800 Subject: [PATCH 04/17] Avoid including DeltaMergeStore.h Signed-off-by: Wish --- dbms/src/Common/TiFlashMetrics.h | 12 +++--- dbms/src/Flash/Management/ManualCompact.cpp | 2 +- .../Storages/DeltaMerge/DeltaMergeStore.cpp | 12 +++--- .../src/Storages/DeltaMerge/DeltaMergeStore.h | 21 +++++----- .../DeltaMerge/DeltaMergeStore_Ingest.cpp | 2 +- .../DeltaMergeStore_InternalSegment.cpp | 40 +++++++++---------- .../tests/gtest_dm_delta_merge_store.cpp | 28 ++++++------- dbms/src/Storages/StorageDeltaMerge.cpp | 4 +- dbms/src/Storages/StorageDeltaMerge.h | 11 ++--- 9 files changed, 63 insertions(+), 69 deletions(-) diff --git a/dbms/src/Common/TiFlashMetrics.h b/dbms/src/Common/TiFlashMetrics.h index 062fd31c5ce..5ab33efec85 100644 --- a/dbms/src/Common/TiFlashMetrics.h +++ b/dbms/src/Common/TiFlashMetrics.h @@ -107,20 +107,20 @@ namespace DB M(tiflash_storage_command_count, "Total number of storage's command, such as delete range / shutdown /startup", Counter, \ F(type_delete_range, {"type", "delete_range"}), F(type_ingest, {"type", "ingest"})) \ M(tiflash_storage_subtask_count, "Total number of storage's sub task", Counter, \ - F(type_delta_merge, {"type", "delta_merge"}), \ - F(type_delta_merge_fg, {"type", "delta_merge_fg"}), \ - F(type_delta_merge_fg_rpc, {"type", "delta_merge_fg_rpc"}), \ + F(type_delta_merge_bg, {"type", "delta_merge_bg"}), \ F(type_delta_merge_bg_gc, {"type", "delta_merge_bg_gc"}), \ + F(type_delta_merge_fg, {"type", "delta_merge_fg"}), \ + F(type_delta_merge_manual, {"type", "delta_merge_manual"}), \ F(type_delta_compact, {"type", "delta_compact"}), \ F(type_delta_flush, {"type", "delta_flush"}), \ F(type_seg_split, {"type", "seg_split"}), F(type_seg_split_fg, {"type", "seg_split_fg"}), \ F(type_seg_merge, {"type", "seg_merge"}), F(type_seg_merge_fg, {"type", "seg_merge_fg"}), \ F(type_place_index_update, {"type", "place_index_update"})) \ M(tiflash_storage_subtask_duration_seconds, "Bucketed histogram of storage's sub task duration", Histogram, \ - F(type_delta_merge, {{"type", "delta_merge"}}, ExpBuckets{0.001, 2, 20}), \ - F(type_delta_merge_fg, {{"type", "delta_merge_fg"}}, ExpBuckets{0.001, 2, 20}), \ - F(type_delta_merge_fg_rpc, {{"type", "delta_merge_fg_rpc"}}, ExpBuckets{0.001, 2, 20}), \ + F(type_delta_merge_bg, {{"type", "delta_merge_bg"}}, ExpBuckets{0.001, 2, 20}), \ F(type_delta_merge_bg_gc, {{"type", "delta_merge_bg_gc"}}, ExpBuckets{0.001, 2, 20}), \ + F(type_delta_merge_fg, {{"type", "delta_merge_fg"}}, ExpBuckets{0.001, 2, 20}), \ + F(type_delta_merge_manual, {{"type", "delta_merge_manual"}}, ExpBuckets{0.001, 2, 20}), \ F(type_delta_compact, {{"type", "delta_compact"}}, ExpBuckets{0.001, 2, 20}), \ F(type_delta_flush, {{"type", "delta_flush"}}, ExpBuckets{0.001, 2, 20}), \ F(type_seg_split, {{"type", "seg_split"}}, ExpBuckets{0.001, 2, 20}), \ diff --git a/dbms/src/Flash/Management/ManualCompact.cpp b/dbms/src/Flash/Management/ManualCompact.cpp index 2143be88cc3..d872ee02fdf 100644 --- a/dbms/src/Flash/Management/ManualCompact.cpp +++ b/dbms/src/Flash/Management/ManualCompact.cpp @@ -167,7 +167,7 @@ grpc::Status ManualCompactManager::doWork(const ::kvrpcpb::CompactRequest * requ // Repeatedly merge multiple segments as much as possible. while (true) { - auto compacted_range = dm_storage->mergeDeltaBySegment(global_context, start_key, DM::DeltaMergeStore::TaskRunThread::ForegroundRPC); + auto compacted_range = dm_storage->mergeDeltaBySegment(global_context, start_key); if (compacted_range == std::nullopt) { diff --git a/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp b/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp index 658ef8d63bc..28efa07fe72 100644 --- a/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp +++ b/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp @@ -862,13 +862,13 @@ void DeltaMergeStore::mergeDeltaAll(const Context & context) for (auto & segment : all_segments) { - segmentMergeDelta(*dm_context, segment, TaskRunThread::Foreground); + segmentMergeDelta(*dm_context, segment, MergeDeltaReason::Manual); } LOG_FMT_INFO(log, "Finish table mergeDeltaAll"); } -std::optional DeltaMergeStore::mergeDeltaBySegment(const Context & context, const RowKeyValue & start_key, const TaskRunThread run_thread) +std::optional DeltaMergeStore::mergeDeltaBySegment(const Context & context, const RowKeyValue & start_key) { LOG_FMT_INFO(log, "Table mergeDeltaBySegment, start={}", start_key.toDebugString()); @@ -895,7 +895,7 @@ std::optional DeltaMergeStore::mergeDeltaBySegment(const Contex if (segment->flushCache(*dm_context)) { - const auto new_segment = segmentMergeDelta(*dm_context, segment, run_thread); + const auto new_segment = segmentMergeDelta(*dm_context, segment, MergeDeltaReason::Manual); if (new_segment) { const auto segment_end = new_segment->getRowKeyRange().end; @@ -1412,7 +1412,7 @@ void DeltaMergeStore::checkSegmentUpdate(const DMContextPtr & dm_context, const GET_METRIC(tiflash_storage_write_stall_duration_seconds, type_delta_merge_by_delete_range).Observe(watch.elapsedSeconds()); }); - return segmentMergeDelta(*dm_context, segment, TaskRunThread::Foreground); + return segmentMergeDelta(*dm_context, segment, MergeDeltaReason::ForegroundWrite); } return {}; }; @@ -1579,7 +1579,7 @@ bool DeltaMergeStore::handleBackgroundTask(bool heavy) case TaskType::MergeDelta: { FAIL_POINT_PAUSE(FailPoints::pause_before_dt_background_delta_merge); - left = segmentMergeDelta(*task.dm_context, task.segment, TaskRunThread::BackgroundThreadPool); + left = segmentMergeDelta(*task.dm_context, task.segment, MergeDeltaReason::BackgroundThreadPool); type = ThreadType::BG_MergeDelta; // Wake up all waiting threads if failpoint is enabled FailPointHelper::disableFailPoint(FailPoints::pause_until_dt_background_delta_merge); @@ -1786,7 +1786,7 @@ UInt64 DeltaMergeStore::onSyncGc(Int64 limit) bool finish_gc_on_segment = false; if (should_compact) { - if (segment = segmentMergeDelta(*dm_context, segment, TaskRunThread::BackgroundGCThread, segment_snap); segment) + if (segment = segmentMergeDelta(*dm_context, segment, MergeDeltaReason::BackgroundGCThread, segment_snap); segment) { // Continue to check whether we need to apply more tasks on this segment segment_snap = {}; diff --git a/dbms/src/Storages/DeltaMerge/DeltaMergeStore.h b/dbms/src/Storages/DeltaMerge/DeltaMergeStore.h index 356aff1f775..e8790eb986e 100644 --- a/dbms/src/Storages/DeltaMerge/DeltaMergeStore.h +++ b/dbms/src/Storages/DeltaMerge/DeltaMergeStore.h @@ -180,13 +180,12 @@ class DeltaMergeStore : private boost::noncopyable PlaceIndex, }; - // TODO: Rename to MergeDeltaThreadType - enum TaskRunThread + enum MergeDeltaReason { BackgroundThreadPool, - Foreground, - ForegroundRPC, BackgroundGCThread, + ForegroundWrite, + Manual, }; static std::string toString(ThreadType type) @@ -237,18 +236,18 @@ class DeltaMergeStore : private boost::noncopyable } } - static std::string toString(TaskRunThread type) + static std::string toString(MergeDeltaReason type) { switch (type) { case BackgroundThreadPool: return "BackgroundThreadPool"; - case Foreground: - return "Foreground"; - case ForegroundRPC: - return "ForegroundRPC"; case BackgroundGCThread: return "BackgroundGCThread"; + case ForegroundWrite: + return "ForegroundWrite"; + case Manual: + return "Manual"; default: return "Unknown"; } @@ -397,7 +396,7 @@ class DeltaMergeStore : private boost::noncopyable /// If there is no segment found by the start key, nullopt is returned. /// /// This function is called when using `ALTER TABLE [TABLE] COMPACT ...` from TiDB. - std::optional mergeDeltaBySegment(const Context & context, const DM::RowKeyValue & start_key, TaskRunThread run_thread); + std::optional mergeDeltaBySegment(const Context & context, const DM::RowKeyValue & start_key); /// Compact the delta layer, merging multiple fragmented delta files into larger ones. /// This is a minor compaction as it does not merge the delta into stable layer. @@ -495,7 +494,7 @@ class DeltaMergeStore : private boost::noncopyable SegmentPtr segmentMergeDelta( DMContext & dm_context, const SegmentPtr & segment, - TaskRunThread thread, + MergeDeltaReason reason, SegmentSnapshotPtr segment_snap = nullptr); bool updateGCSafePoint(); diff --git a/dbms/src/Storages/DeltaMerge/DeltaMergeStore_Ingest.cpp b/dbms/src/Storages/DeltaMerge/DeltaMergeStore_Ingest.cpp index b2b04a747eb..5c467ddb962 100644 --- a/dbms/src/Storages/DeltaMerge/DeltaMergeStore_Ingest.cpp +++ b/dbms/src/Storages/DeltaMerge/DeltaMergeStore_Ingest.cpp @@ -195,7 +195,7 @@ void DeltaMergeStore::ingestFiles( updated_segments.push_back(segment); fiu_do_on(FailPoints::segment_merge_after_ingest_packs, { segment->flushCache(*dm_context); - segmentMergeDelta(*dm_context, segment, TaskRunThread::BackgroundThreadPool); + segmentMergeDelta(*dm_context, segment, MergeDeltaReason::ForegroundWrite); storage_pool->gc(global_context.getSettingsRef(), StoragePool::Seconds(0)); }); break; diff --git a/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalSegment.cpp b/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalSegment.cpp index 71a26aebb9e..3a837e7baa4 100644 --- a/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalSegment.cpp +++ b/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalSegment.cpp @@ -302,10 +302,10 @@ void DeltaMergeStore::segmentMerge(DMContext & dm_context, const SegmentPtr & le SegmentPtr DeltaMergeStore::segmentMergeDelta( DMContext & dm_context, const SegmentPtr & segment, - const TaskRunThread run_thread, + const MergeDeltaReason reason, SegmentSnapshotPtr segment_snap) { - LOG_FMT_INFO(log, "MergeDelta - Begin, thread={} safe_point={} segment={}", toString(run_thread), dm_context.min_version, segment->info()); + LOG_FMT_INFO(log, "MergeDelta - Begin, reason={} safe_point={} segment={}", toString(reason), dm_context.min_version, segment->info()); ColumnDefinesPtr schema_snap; @@ -338,19 +338,19 @@ SegmentPtr DeltaMergeStore::segmentMergeDelta( CurrentMetrics::Increment cur_dm_total_bytes{CurrentMetrics::DT_DeltaMergeTotalBytes, static_cast(segment_snap->getBytes())}; CurrentMetrics::Increment cur_dm_total_rows{CurrentMetrics::DT_DeltaMergeTotalRows, static_cast(segment_snap->getRows())}; - switch (run_thread) + switch (reason) { - case TaskRunThread::BackgroundThreadPool: - GET_METRIC(tiflash_storage_subtask_count, type_delta_merge).Increment(); + case MergeDeltaReason::BackgroundThreadPool: + GET_METRIC(tiflash_storage_subtask_count, type_delta_merge_bg).Increment(); break; - case TaskRunThread::Foreground: - GET_METRIC(tiflash_storage_subtask_count, type_delta_merge_fg).Increment(); + case MergeDeltaReason::BackgroundGCThread: + GET_METRIC(tiflash_storage_subtask_count, type_delta_merge_bg_gc).Increment(); break; - case TaskRunThread::ForegroundRPC: - GET_METRIC(tiflash_storage_subtask_count, type_delta_merge_fg_rpc).Increment(); + case MergeDeltaReason::ForegroundWrite: + GET_METRIC(tiflash_storage_subtask_count, type_delta_merge_fg).Increment(); break; - case TaskRunThread::BackgroundGCThread: - GET_METRIC(tiflash_storage_subtask_count, type_delta_merge_bg_gc).Increment(); + case MergeDeltaReason::Manual: + GET_METRIC(tiflash_storage_subtask_count, type_delta_merge_manual).Increment(); break; default: break; @@ -358,19 +358,19 @@ SegmentPtr DeltaMergeStore::segmentMergeDelta( Stopwatch watch_delta_merge; SCOPE_EXIT({ - switch (run_thread) + switch (reason) { - case TaskRunThread::BackgroundThreadPool: - GET_METRIC(tiflash_storage_subtask_duration_seconds, type_delta_merge).Observe(watch_delta_merge.elapsedSeconds()); + case MergeDeltaReason::BackgroundThreadPool: + GET_METRIC(tiflash_storage_subtask_duration_seconds, type_delta_merge_bg).Observe(watch_delta_merge.elapsedSeconds()); break; - case TaskRunThread::Foreground: - GET_METRIC(tiflash_storage_subtask_duration_seconds, type_delta_merge_fg).Observe(watch_delta_merge.elapsedSeconds()); + case MergeDeltaReason::BackgroundGCThread: + GET_METRIC(tiflash_storage_subtask_duration_seconds, type_delta_merge_bg_gc).Observe(watch_delta_merge.elapsedSeconds()); break; - case TaskRunThread::ForegroundRPC: - GET_METRIC(tiflash_storage_subtask_duration_seconds, type_delta_merge_fg_rpc).Observe(watch_delta_merge.elapsedSeconds()); + case MergeDeltaReason::ForegroundWrite: + GET_METRIC(tiflash_storage_subtask_duration_seconds, type_delta_merge_fg).Observe(watch_delta_merge.elapsedSeconds()); break; - case TaskRunThread::BackgroundGCThread: - GET_METRIC(tiflash_storage_subtask_duration_seconds, type_delta_merge_bg_gc).Observe(watch_delta_merge.elapsedSeconds()); + case MergeDeltaReason::Manual: + GET_METRIC(tiflash_storage_subtask_duration_seconds, type_delta_merge_manual).Observe(watch_delta_merge.elapsedSeconds()); break; default: break; diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_dm_delta_merge_store.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_dm_delta_merge_store.cpp index f3d1daa739a..6b7f774bd5a 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_dm_delta_merge_store.cpp +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_dm_delta_merge_store.cpp @@ -3095,13 +3095,13 @@ try if (store->isCommonHandle()) { // Specifies MAX_KEY. nullopt should be returned. - auto result = store->mergeDeltaBySegment(*db_context, RowKeyValue::COMMON_HANDLE_MAX_KEY, DeltaMergeStore::TaskRunThread::Foreground); + auto result = store->mergeDeltaBySegment(*db_context, RowKeyValue::COMMON_HANDLE_MAX_KEY, DeltaMergeStore::MergeDeltaReason::Manual); ASSERT_EQ(result, std::nullopt); } else { // Specifies MAX_KEY. nullopt should be returned. - auto result = store->mergeDeltaBySegment(*db_context, RowKeyValue::INT_HANDLE_MAX_KEY, DeltaMergeStore::TaskRunThread::Foreground); + auto result = store->mergeDeltaBySegment(*db_context, RowKeyValue::INT_HANDLE_MAX_KEY, DeltaMergeStore::MergeDeltaReason::Manual); ASSERT_EQ(result, std::nullopt); } std::optional result_1; @@ -3109,11 +3109,11 @@ try // Specifies MIN_KEY. In this case, the first segment should be processed. if (store->isCommonHandle()) { - result_1 = store->mergeDeltaBySegment(*db_context, RowKeyValue::COMMON_HANDLE_MIN_KEY, DeltaMergeStore::TaskRunThread::Foreground); + result_1 = store->mergeDeltaBySegment(*db_context, RowKeyValue::COMMON_HANDLE_MIN_KEY, DeltaMergeStore::MergeDeltaReason::Manual); } else { - result_1 = store->mergeDeltaBySegment(*db_context, RowKeyValue::INT_HANDLE_MIN_KEY, DeltaMergeStore::TaskRunThread::Foreground); + result_1 = store->mergeDeltaBySegment(*db_context, RowKeyValue::INT_HANDLE_MIN_KEY, DeltaMergeStore::MergeDeltaReason::Manual); } // The returned range is the same as first segment's range. ASSERT_NE(result_1, std::nullopt); @@ -3125,7 +3125,7 @@ try } { // Compact the first segment again, nothing should change. - auto result = store->mergeDeltaBySegment(*db_context, result_1->start, DeltaMergeStore::TaskRunThread::Foreground); + auto result = store->mergeDeltaBySegment(*db_context, result_1->start, DeltaMergeStore::MergeDeltaReason::Manual); ASSERT_EQ(*result, *result_1); helper->verifyExpectedRowsForAllSegments(); @@ -3133,7 +3133,7 @@ try std::optional result_2; { // Compact again using the end key just returned. The second segment should be processed. - result_2 = store->mergeDeltaBySegment(*db_context, result_1->end, DeltaMergeStore::TaskRunThread::Foreground); + result_2 = store->mergeDeltaBySegment(*db_context, result_1->end, DeltaMergeStore::MergeDeltaReason::Manual); ASSERT_NE(result_2, std::nullopt); ASSERT_EQ(*result_2, std::next(store->segments.begin())->second->getRowKeyRange()); @@ -3151,12 +3151,12 @@ TEST_P(DeltaMergeStoreMergeDeltaBySegmentTest, InvalidKey) if (store->isCommonHandle()) { // For common handle, give int handle key and have a try - store->mergeDeltaBySegment(*db_context, RowKeyValue::INT_HANDLE_MIN_KEY, DeltaMergeStore::TaskRunThread::Foreground); + store->mergeDeltaBySegment(*db_context, RowKeyValue::INT_HANDLE_MIN_KEY, DeltaMergeStore::MergeDeltaReason::Manual); } else { // For int handle, give common handle key and have a try - store->mergeDeltaBySegment(*db_context, RowKeyValue::COMMON_HANDLE_MIN_KEY, DeltaMergeStore::TaskRunThread::Foreground); + store->mergeDeltaBySegment(*db_context, RowKeyValue::COMMON_HANDLE_MIN_KEY, DeltaMergeStore::MergeDeltaReason::Manual); } }); } @@ -3172,13 +3172,13 @@ try ASSERT_NE(it, store->segments.end()); auto seg = it->second; - result = store->mergeDeltaBySegment(*db_context, seg->getRowKeyRange().start, DeltaMergeStore::TaskRunThread::Foreground); + result = store->mergeDeltaBySegment(*db_context, seg->getRowKeyRange().start, DeltaMergeStore::MergeDeltaReason::Manual); ASSERT_NE(result, std::nullopt); helper->verifyExpectedRowsForAllSegments(); } { // As we are the last segment, compact "next segment" should result in failure. A nullopt is returned. - auto result2 = store->mergeDeltaBySegment(*db_context, result->end, DeltaMergeStore::TaskRunThread::Foreground); + auto result2 = store->mergeDeltaBySegment(*db_context, result->end, DeltaMergeStore::MergeDeltaReason::Manual); ASSERT_EQ(result2, std::nullopt); helper->verifyExpectedRowsForAllSegments(); } @@ -3207,7 +3207,7 @@ try auto range = std::next(store->segments.begin())->second->getRowKeyRange(); auto compact_key = range.start.toPrefixNext(); - auto result = store->mergeDeltaBySegment(*db_context, compact_key, DeltaMergeStore::TaskRunThread::Foreground); + auto result = store->mergeDeltaBySegment(*db_context, compact_key, DeltaMergeStore::MergeDeltaReason::Manual); ASSERT_NE(result, std::nullopt); helper->expected_stable_rows[1] += helper->expected_delta_rows[1]; @@ -3251,7 +3251,7 @@ try } { auto segment1 = std::next(store->segments.begin())->second; - auto result = store->mergeDeltaBySegment(*db_context, segment1->getRowKeyRange().start, DeltaMergeStore::TaskRunThread::Foreground); + auto result = store->mergeDeltaBySegment(*db_context, segment1->getRowKeyRange().start, DeltaMergeStore::MergeDeltaReason::Manual); ASSERT_NE(result, std::nullopt); segment1 = std::next(store->segments.begin())->second; @@ -3299,7 +3299,7 @@ try // Start a mergeDelta. It should hit retry immediately due to a flush is in progress. auto th_merge_delta = std::async([&]() { auto segment1 = std::next(store->segments.begin())->second; - auto result = store->mergeDeltaBySegment(*db_context, segment1->getRowKeyRange().start, DeltaMergeStore::TaskRunThread::Foreground); + auto result = store->mergeDeltaBySegment(*db_context, segment1->getRowKeyRange().start, DeltaMergeStore::MergeDeltaReason::Manual); ASSERT_NE(result, std::nullopt); // All rows in the delta layer should be merged into the stable layer. helper->expected_stable_rows[1] += helper->expected_delta_rows[1]; @@ -3350,7 +3350,7 @@ try auto th_merge_delta = std::async([&] { // mergeDeltaBySegment for segment1 auto segment1 = std::next(store->segments.begin())->second; - auto result = store->mergeDeltaBySegment(*db_context, segment1->getRowKeyRange().start, DeltaMergeStore::TaskRunThread::Foreground); + auto result = store->mergeDeltaBySegment(*db_context, segment1->getRowKeyRange().start, DeltaMergeStore::MergeDeltaReason::Manual); ASSERT_NE(result, std::nullopt); // Although original segment1 has been split into 2, we still expect only segment1's delta diff --git a/dbms/src/Storages/StorageDeltaMerge.cpp b/dbms/src/Storages/StorageDeltaMerge.cpp index 55e8348861a..03289f76839 100644 --- a/dbms/src/Storages/StorageDeltaMerge.cpp +++ b/dbms/src/Storages/StorageDeltaMerge.cpp @@ -789,9 +789,9 @@ void StorageDeltaMerge::mergeDelta(const Context & context) getAndMaybeInitStore()->mergeDeltaAll(context); } -std::optional StorageDeltaMerge::mergeDeltaBySegment(const Context & context, const DM::RowKeyValue & start_key, const DM::DeltaMergeStore::TaskRunThread run_thread) +std::optional StorageDeltaMerge::mergeDeltaBySegment(const Context & context, const DM::RowKeyValue & start_key) { - return getAndMaybeInitStore()->mergeDeltaBySegment(context, start_key, run_thread); + return getAndMaybeInitStore()->mergeDeltaBySegment(context, start_key); } void StorageDeltaMerge::deleteRange(const DM::RowKeyRange & range_to_delete, const Settings & settings) diff --git a/dbms/src/Storages/StorageDeltaMerge.h b/dbms/src/Storages/StorageDeltaMerge.h index 4fb1f3032da..886e198d336 100644 --- a/dbms/src/Storages/StorageDeltaMerge.h +++ b/dbms/src/Storages/StorageDeltaMerge.h @@ -14,12 +14,11 @@ #pragma once +#include #include #include #include #include -#include -#include #include #include #include @@ -27,16 +26,12 @@ #include -namespace Poco -{ -class Logger; -} // namespace Poco - namespace DB { namespace DM { struct RowKeyRange; +struct RowKeyValue; class DeltaMergeStore; using DeltaMergeStorePtr = std::shared_ptr; } // namespace DM @@ -85,7 +80,7 @@ class StorageDeltaMerge /// If there is no segment found by the start key, nullopt is returned. /// /// This function is called when using `ALTER TABLE [TABLE] COMPACT ...` from TiDB. - std::optional mergeDeltaBySegment(const Context & context, const DM::RowKeyValue & start_key, const DM::DeltaMergeStore::TaskRunThread run_thread); + std::optional mergeDeltaBySegment(const Context & context, const DM::RowKeyValue & start_key); void deleteRange(const DM::RowKeyRange & range_to_delete, const Settings & settings); From 366cb810d5781eff16f10c1447115174106f72c4 Mon Sep 17 00:00:00 2001 From: Wish Date: Fri, 9 Sep 2022 12:40:38 +0800 Subject: [PATCH 05/17] Fix (1) Signed-off-by: Wish --- .../tests/gtest_dm_delta_merge_store.cpp | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_dm_delta_merge_store.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_dm_delta_merge_store.cpp index 6b7f774bd5a..bca226c8b7b 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_dm_delta_merge_store.cpp +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_dm_delta_merge_store.cpp @@ -3095,13 +3095,13 @@ try if (store->isCommonHandle()) { // Specifies MAX_KEY. nullopt should be returned. - auto result = store->mergeDeltaBySegment(*db_context, RowKeyValue::COMMON_HANDLE_MAX_KEY, DeltaMergeStore::MergeDeltaReason::Manual); + auto result = store->mergeDeltaBySegment(*db_context, RowKeyValue::COMMON_HANDLE_MAX_KEY); ASSERT_EQ(result, std::nullopt); } else { // Specifies MAX_KEY. nullopt should be returned. - auto result = store->mergeDeltaBySegment(*db_context, RowKeyValue::INT_HANDLE_MAX_KEY, DeltaMergeStore::MergeDeltaReason::Manual); + auto result = store->mergeDeltaBySegment(*db_context, RowKeyValue::INT_HANDLE_MAX_KEY); ASSERT_EQ(result, std::nullopt); } std::optional result_1; @@ -3109,11 +3109,11 @@ try // Specifies MIN_KEY. In this case, the first segment should be processed. if (store->isCommonHandle()) { - result_1 = store->mergeDeltaBySegment(*db_context, RowKeyValue::COMMON_HANDLE_MIN_KEY, DeltaMergeStore::MergeDeltaReason::Manual); + result_1 = store->mergeDeltaBySegment(*db_context, RowKeyValue::COMMON_HANDLE_MIN_KEY); } else { - result_1 = store->mergeDeltaBySegment(*db_context, RowKeyValue::INT_HANDLE_MIN_KEY, DeltaMergeStore::MergeDeltaReason::Manual); + result_1 = store->mergeDeltaBySegment(*db_context, RowKeyValue::INT_HANDLE_MIN_KEY); } // The returned range is the same as first segment's range. ASSERT_NE(result_1, std::nullopt); @@ -3125,7 +3125,7 @@ try } { // Compact the first segment again, nothing should change. - auto result = store->mergeDeltaBySegment(*db_context, result_1->start, DeltaMergeStore::MergeDeltaReason::Manual); + auto result = store->mergeDeltaBySegment(*db_context, result_1->start); ASSERT_EQ(*result, *result_1); helper->verifyExpectedRowsForAllSegments(); @@ -3133,7 +3133,7 @@ try std::optional result_2; { // Compact again using the end key just returned. The second segment should be processed. - result_2 = store->mergeDeltaBySegment(*db_context, result_1->end, DeltaMergeStore::MergeDeltaReason::Manual); + result_2 = store->mergeDeltaBySegment(*db_context, result_1->end); ASSERT_NE(result_2, std::nullopt); ASSERT_EQ(*result_2, std::next(store->segments.begin())->second->getRowKeyRange()); @@ -3151,12 +3151,12 @@ TEST_P(DeltaMergeStoreMergeDeltaBySegmentTest, InvalidKey) if (store->isCommonHandle()) { // For common handle, give int handle key and have a try - store->mergeDeltaBySegment(*db_context, RowKeyValue::INT_HANDLE_MIN_KEY, DeltaMergeStore::MergeDeltaReason::Manual); + store->mergeDeltaBySegment(*db_context, RowKeyValue::INT_HANDLE_MIN_KEY); } else { // For int handle, give common handle key and have a try - store->mergeDeltaBySegment(*db_context, RowKeyValue::COMMON_HANDLE_MIN_KEY, DeltaMergeStore::MergeDeltaReason::Manual); + store->mergeDeltaBySegment(*db_context, RowKeyValue::COMMON_HANDLE_MIN_KEY); } }); } @@ -3172,13 +3172,13 @@ try ASSERT_NE(it, store->segments.end()); auto seg = it->second; - result = store->mergeDeltaBySegment(*db_context, seg->getRowKeyRange().start, DeltaMergeStore::MergeDeltaReason::Manual); + result = store->mergeDeltaBySegment(*db_context, seg->getRowKeyRange().start); ASSERT_NE(result, std::nullopt); helper->verifyExpectedRowsForAllSegments(); } { // As we are the last segment, compact "next segment" should result in failure. A nullopt is returned. - auto result2 = store->mergeDeltaBySegment(*db_context, result->end, DeltaMergeStore::MergeDeltaReason::Manual); + auto result2 = store->mergeDeltaBySegment(*db_context, result->end); ASSERT_EQ(result2, std::nullopt); helper->verifyExpectedRowsForAllSegments(); } @@ -3207,7 +3207,7 @@ try auto range = std::next(store->segments.begin())->second->getRowKeyRange(); auto compact_key = range.start.toPrefixNext(); - auto result = store->mergeDeltaBySegment(*db_context, compact_key, DeltaMergeStore::MergeDeltaReason::Manual); + auto result = store->mergeDeltaBySegment(*db_context, compact_key); ASSERT_NE(result, std::nullopt); helper->expected_stable_rows[1] += helper->expected_delta_rows[1]; @@ -3251,7 +3251,7 @@ try } { auto segment1 = std::next(store->segments.begin())->second; - auto result = store->mergeDeltaBySegment(*db_context, segment1->getRowKeyRange().start, DeltaMergeStore::MergeDeltaReason::Manual); + auto result = store->mergeDeltaBySegment(*db_context, segment1->getRowKeyRange().start); ASSERT_NE(result, std::nullopt); segment1 = std::next(store->segments.begin())->second; @@ -3299,7 +3299,7 @@ try // Start a mergeDelta. It should hit retry immediately due to a flush is in progress. auto th_merge_delta = std::async([&]() { auto segment1 = std::next(store->segments.begin())->second; - auto result = store->mergeDeltaBySegment(*db_context, segment1->getRowKeyRange().start, DeltaMergeStore::MergeDeltaReason::Manual); + auto result = store->mergeDeltaBySegment(*db_context, segment1->getRowKeyRange().start); ASSERT_NE(result, std::nullopt); // All rows in the delta layer should be merged into the stable layer. helper->expected_stable_rows[1] += helper->expected_delta_rows[1]; @@ -3350,7 +3350,7 @@ try auto th_merge_delta = std::async([&] { // mergeDeltaBySegment for segment1 auto segment1 = std::next(store->segments.begin())->second; - auto result = store->mergeDeltaBySegment(*db_context, segment1->getRowKeyRange().start, DeltaMergeStore::MergeDeltaReason::Manual); + auto result = store->mergeDeltaBySegment(*db_context, segment1->getRowKeyRange().start); ASSERT_NE(result, std::nullopt); // Although original segment1 has been split into 2, we still expect only segment1's delta From 47ea426ea6eaf76147b971895eee2fa7328125d5 Mon Sep 17 00:00:00 2001 From: Wish Date: Mon, 12 Sep 2022 15:21:50 +0800 Subject: [PATCH 06/17] wip --- .../Storages/DeltaMerge/DeltaMergeStore.cpp | 2 +- .../src/Storages/DeltaMerge/DeltaMergeStore.h | 40 ++-- .../DeltaMergeStore_InternalSegment.cpp | 132 ++++++----- dbms/src/Storages/DeltaMerge/Segment.cpp | 178 +++++++++------ dbms/src/Storages/DeltaMerge/Segment.h | 51 +++-- .../DeltaMerge/tests/gtest_dm_segment.cpp | 52 ++++- .../tests/gtest_dm_segment_common_handle.cpp | 2 +- .../DeltaMerge/tests/gtest_segment.cpp | 14 +- .../tests/gtest_segment_test_basic.cpp | 216 +++++++++--------- .../tests/gtest_segment_test_basic.h | 11 +- 10 files changed, 417 insertions(+), 281 deletions(-) diff --git a/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp b/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp index 28efa07fe72..0f1593573ab 100644 --- a/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp +++ b/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp @@ -1573,7 +1573,7 @@ bool DeltaMergeStore::handleBackgroundTask(bool heavy) type = ThreadType::BG_Split; break; case TaskType::Merge: - segmentMerge(*task.dm_context, task.segment, task.next_segment, false); + segmentMerge(*task.dm_context, {task.segment, task.next_segment}, false); type = ThreadType::BG_Merge; break; case TaskType::MergeDelta: diff --git a/dbms/src/Storages/DeltaMerge/DeltaMergeStore.h b/dbms/src/Storages/DeltaMerge/DeltaMergeStore.h index e8790eb986e..7d4f9a6c1d9 100644 --- a/dbms/src/Storages/DeltaMerge/DeltaMergeStore.h +++ b/dbms/src/Storages/DeltaMerge/DeltaMergeStore.h @@ -472,25 +472,35 @@ class DeltaMergeStore : private boost::noncopyable void waitForDeleteRange(const DMContextPtr & context, const SegmentPtr & segment); - /// Try to update the segment. "Update" means splitting the segment into two, merging two segments, merging the delta, etc. - /// If an update is really performed, the segment will be abandoned (with `segment->hasAbandoned() == true`). - /// See `segmentSplit`, `segmentMerge`, `segmentMergeDelta` for details. - /// - /// This may be called from multiple threads, e.g. at the foreground write moment, or in background threads. - /// A `thread_type` should be specified indicating the type of the thread calling this function. - /// Depend on the thread type, the "update" to do may be varied. + /** + * Try to update the segment. "Update" means splitting the segment into two, merging two segments, merging the delta, etc. + * If an update is really performed, the segment will be abandoned (with `segment->hasAbandoned() == true`). + * See `segmentSplit`, `segmentMerge`, `segmentMergeDelta` for details. + * + * This may be called from multiple threads, e.g. at the foreground write moment, or in background threads. + * A `thread_type` should be specified indicating the type of the thread calling this function. + * Depend on the thread type, the "update" to do may be varied. + */ void checkSegmentUpdate(const DMContextPtr & context, const SegmentPtr & segment, ThreadType thread_type); - /// Split the segment into two. - /// After splitting, the segment will be abandoned (with `segment->hasAbandoned() == true`) and the new two segments will be returned. + /** + * Split the segment into two. + * After splitting, the segment will be abandoned (with `segment->hasAbandoned() == true`) and the new two segments will be returned. + */ SegmentPair segmentSplit(DMContext & dm_context, const SegmentPtr & segment, bool is_foreground); - /// Merge two segments into one. - /// After merging, both segments will be abandoned (with `segment->hasAbandoned() == true`). - void segmentMerge(DMContext & dm_context, const SegmentPtr & left, const SegmentPtr & right, bool is_foreground); - - /// Merge the delta (major compaction) in the segment. - /// After delta-merging, the segment will be abandoned (with `segment->hasAbandoned() == true`) and a new segment will be returned. + /** + * Merge multiple continuous segments (order by segment start key) into one. + * Throw exception if < 2 segments are given. + * Fail if given segments are not continuous or not valid. + * After merging, all specified segments will be abandoned (with `segment->hasAbandoned() == true`). + */ + SegmentPtr segmentMerge(DMContext & dm_context, const std::vector & ordered_segments, bool is_foreground); + + /** + * Merge the delta (major compaction) in the segment. + * After delta-merging, the segment will be abandoned (with `segment->hasAbandoned() == true`) and a new segment will be returned. + */ SegmentPtr segmentMergeDelta( DMContext & dm_context, const SegmentPtr & segment, diff --git a/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalSegment.cpp b/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalSegment.cpp index 3a837e7baa4..8a74b3e1372 100644 --- a/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalSegment.cpp +++ b/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalSegment.cpp @@ -163,71 +163,85 @@ SegmentPair DeltaMergeStore::segmentSplit(DMContext & dm_context, const SegmentP return {new_left, new_right}; } -void DeltaMergeStore::segmentMerge(DMContext & dm_context, const SegmentPtr & left, const SegmentPtr & right, bool is_foreground) +SegmentPtr DeltaMergeStore::segmentMerge(DMContext & dm_context, const std::vector & ordered_segments, bool is_foreground) { + RUNTIME_CHECK(ordered_segments.size() >= 2, ordered_segments.size()); + LOG_FMT_INFO( log, - "Merge - Begin, is_foreground={} safe_point={} left={} right={}", + "Merge - Begin, is_foreground={} safe_point={} segments_to_merge={}", is_foreground, dm_context.min_version, - left->info(), - right->info()); + Segment::simpleInfo(ordered_segments)); /// This segment may contain some rows that not belong to this segment range which is left by previous split operation. /// And only saved data in this segment will be filtered by the segment range in the merge process, /// unsaved data will be directly copied to the new segment. /// So we flush here to make sure that all potential data left by previous split operation is saved. - while (!left->flushCache(dm_context)) - { - // keep flush until success if not abandoned - if (left->hasAbandoned()) - { - LOG_FMT_INFO(log, "Merge - Give up segmentMerge because left abandoned, left={} right={}", left->simpleInfo(), right->simpleInfo()); - return; - } - } - while (!right->flushCache(dm_context)) + for (const auto & seg : ordered_segments) { - // keep flush until success if not abandoned - if (right->hasAbandoned()) + size_t sleep_ms = 5; + while (true) { - LOG_FMT_DEBUG(log, "Merge - Give up segmentMerge because right abandoned, left={} right={}", left->simpleInfo(), right->simpleInfo()); - return; + if (seg->hasAbandoned()) + { + LOG_FMT_INFO(log, "Merge - Give up segmentMerge because segment abandoned, segment={}", seg->simpleInfo()); + return {}; + } + + if (seg->flushCache(dm_context)) + break; + + // Else: retry. Flush could fail. Typical cases: + // #1. The segment is abandoned (due to an update is finished) + // #2. There is another flush in progress, for example, triggered in background + // Let's sleep 5ms ~ 100ms and then retry flush again. + std::this_thread::sleep_for(std::chrono::milliseconds(sleep_ms)); + sleep_ms = std::min(sleep_ms * 2, 100); } } - SegmentSnapshotPtr left_snap; - SegmentSnapshotPtr right_snap; + std::vector ordered_snapshots; + ordered_snapshots.reserve(ordered_segments.size()); ColumnDefinesPtr schema_snap; { std::shared_lock lock(read_write_mutex); - if (!isSegmentValid(lock, left)) + for (const auto & seg : ordered_segments) { - LOG_FMT_DEBUG(log, "Merge - Give up segmentMerge because left not valid, left={} right={}", left->simpleInfo(), right->simpleInfo()); - return; + if (!isSegmentValid(lock, seg)) + { + LOG_FMT_DEBUG(log, "Merge - Give up segmentMerge because not valid, segment={}", seg->info()); + return {}; + } } - if (!isSegmentValid(lock, right)) + + for (const auto & seg : ordered_segments) { - LOG_FMT_DEBUG(log, "Merge - Give up segmentMerge because right not valid, left={} right={}", left->simpleInfo(), right->simpleInfo()); - return; - } + // TODO: Should we ensure the ordering of "segments" first? - left_snap = left->createSnapshot(dm_context, /* for_update */ true, CurrentMetrics::DT_SnapshotOfSegmentMerge); - right_snap = right->createSnapshot(dm_context, /* for_update */ true, CurrentMetrics::DT_SnapshotOfSegmentMerge); + auto snap = seg->createSnapshot(dm_context, /* for_update */ true, CurrentMetrics::DT_SnapshotOfSegmentMerge); + if (!snap) + { + LOG_FMT_DEBUG(log, "Merge - Give up segmentMerge because snapshot failed, segment={}", seg->info()); + return {}; + } - if (!left_snap || !right_snap) - { - LOG_FMT_DEBUG(log, "Merge - Give up segmentMerge because snapshot failed, left={} right={}", left->simpleInfo(), right->simpleInfo()); - return; + ordered_snapshots.emplace_back(snap); } + schema_snap = store_columns; } // Not counting the early give up action. - auto delta_bytes = static_cast(left_snap->delta->getBytes()) + right_snap->delta->getBytes(); - auto delta_rows = static_cast(left_snap->delta->getRows()) + right_snap->delta->getRows(); + Int64 delta_bytes = 0; + Int64 delta_rows = 0; + for (const auto & snap : ordered_snapshots) + { + delta_bytes += static_cast(snap->delta->getBytes()); + delta_rows += static_cast(snap->delta->getRows()); + } CurrentMetrics::Increment cur_dm_segments{CurrentMetrics::DT_SegmentMerge}; if (is_foreground) @@ -242,37 +256,40 @@ void DeltaMergeStore::segmentMerge(DMContext & dm_context, const SegmentPtr & le GET_METRIC(tiflash_storage_subtask_duration_seconds, type_seg_merge).Observe(watch_seg_merge.elapsedSeconds()); }); - auto left_range = left->getRowKeyRange(); - auto right_range = right->getRowKeyRange(); - WriteBatches wbs(*storage_pool, dm_context.getWriteLimiter()); - auto merged_stable = Segment::prepareMerge(dm_context, schema_snap, left, left_snap, right, right_snap, wbs); + auto merged_stable = Segment::prepareMerge(dm_context, schema_snap, ordered_segments, ordered_snapshots, wbs); wbs.writeLogAndData(); merged_stable->enableDMFilesGC(); + SegmentPtr merged; { std::unique_lock lock(read_write_mutex); - if (!isSegmentValid(lock, left) || !isSegmentValid(lock, right)) + for (const auto & seg : ordered_segments) { - LOG_FMT_DEBUG(log, "Merge - Give up segmentMerge because left or right not valid, left={} right={}", left->simpleInfo(), right->simpleInfo()); - wbs.setRollback(); - return; + if (!isSegmentValid(lock, seg)) + { + LOG_FMT_DEBUG(log, "Merge - Give up segmentMerge because not valid, segment={}", seg->info()); + wbs.setRollback(); + return {}; + } } - auto left_lock = left->mustGetUpdateLock(); - auto right_lock = right->mustGetUpdateLock(); + std::vector locks; + locks.reserve(ordered_segments.size()); + for (const auto & seg : ordered_segments) + locks.emplace_back(seg->mustGetUpdateLock()); - auto merged = Segment::applyMerge(dm_context, left, left_snap, right, right_snap, wbs, merged_stable); + merged = Segment::applyMerge(dm_context, ordered_segments, ordered_snapshots, wbs, merged_stable); wbs.writeMeta(); - left->abandon(dm_context); - right->abandon(dm_context); - segments.erase(left_range.getEnd()); - segments.erase(right_range.getEnd()); - id_to_segment.erase(left->segmentId()); - id_to_segment.erase(right->segmentId()); + for (const auto & seg : ordered_segments) + { + seg->abandon(dm_context); + segments.erase(seg->getRowKeyRange().getEnd()); + id_to_segment.erase(seg->segmentId()); + } segments.emplace(merged->getRowKeyRange().getEnd(), merged); id_to_segment.emplace(merged->segmentId(), merged); @@ -282,12 +299,11 @@ void DeltaMergeStore::segmentMerge(DMContext & dm_context, const SegmentPtr & le LOG_FMT_INFO( log, - "Merge - Finish, two segments are merged into one, is_foreground={} left={} right={} merged={}", + "Merge - Finish, {} segments are merged into one, is_foreground={} merged={} segments_to_merge={}", + ordered_segments.size(), is_foreground, - dm_context.min_version, - left->info(), - right->info(), - merged->info()); + merged->info(), + Segment::info(ordered_segments)); } wbs.writeRemoves(); @@ -297,6 +313,8 @@ void DeltaMergeStore::segmentMerge(DMContext & dm_context, const SegmentPtr & le if constexpr (DM_RUN_CHECK) check(dm_context.db_context); + + return merged; } SegmentPtr DeltaMergeStore::segmentMergeDelta( diff --git a/dbms/src/Storages/DeltaMerge/Segment.cpp b/dbms/src/Storages/DeltaMerge/Segment.cpp index 11a386626b6..78cc9e936df 100644 --- a/dbms/src/Storages/DeltaMerge/Segment.cpp +++ b/dbms/src/Storages/DeltaMerge/Segment.cpp @@ -1197,49 +1197,50 @@ SegmentPair Segment::applySplit(DMContext & dm_context, // return {new_me, other}; } -SegmentPtr Segment::merge(DMContext & dm_context, const ColumnDefinesPtr & schema_snap, const SegmentPtr & left, const SegmentPtr & right) +SegmentPtr Segment::merge(DMContext & dm_context, const ColumnDefinesPtr & schema_snap, const std::vector & ordered_segments) { WriteBatches wbs(dm_context.storage_pool, dm_context.getWriteLimiter()); /// This segment may contain some rows that not belong to this segment range which is left by previous split operation. /// And only saved data in this segment will be filtered by the segment range in the merge process, /// unsaved data will be directly copied to the new segment. /// So we flush here to make sure that all potential data left by previous split operation is saved. - while (!left->flushCache(dm_context)) + for (const auto & seg : ordered_segments) { - // keep flush until success if not abandoned - if (left->hasAbandoned()) + while (!seg->flushCache(dm_context)) { - LOG_FMT_DEBUG(left->log, "Merge - Give up segmentMerge because left abandoned, left={} right={}", left->simpleInfo(), right->simpleInfo()); - return {}; + // keep flush until success if not abandoned + if (seg->hasAbandoned()) + { + LOG_FMT_DEBUG(seg->log, "Merge - Give up segmentMerge because abandoned, seg={}", seg->simpleInfo()); + return {}; + } } } - while (!right->flushCache(dm_context)) + + std::vector ordered_snapshots; + for (const auto & seg : ordered_segments) { - // keep flush until success if not abandoned - if (right->hasAbandoned()) + auto snap = seg->createSnapshot(dm_context, /* for_update */ true, CurrentMetrics::DT_SnapshotOfSegmentMerge); + if (!snap) { - LOG_FMT_DEBUG(left->log, "Merge - Give up segmentMerge because right abandoned, left={} right={}", left->simpleInfo(), right->simpleInfo()); + LOG_FMT_DEBUG(seg->log, "Merge - Give up segmentMerge because snapshot failed, seg={}", seg->simpleInfo()); return {}; } + ordered_snapshots.emplace_back(snap); } - - auto left_snap = left->createSnapshot(dm_context, true, CurrentMetrics::DT_SnapshotOfSegmentMerge); - auto right_snap = right->createSnapshot(dm_context, true, CurrentMetrics::DT_SnapshotOfSegmentMerge); - if (!left_snap || !right_snap) - return {}; - - auto merged_stable = prepareMerge(dm_context, schema_snap, left, left_snap, right, right_snap, wbs); + auto merged_stable = prepareMerge(dm_context, schema_snap, ordered_segments, ordered_snapshots, wbs); wbs.writeLogAndData(); merged_stable->enableDMFilesGC(); SYNC_FOR("before_Segment::applyMerge"); // pause without holding the lock on segments to be merged - auto left_lock = left->mustGetUpdateLock(); - auto right_lock = right->mustGetUpdateLock(); + std::vector locks; + for (const auto & seg : ordered_segments) + locks.emplace_back(seg->mustGetUpdateLock()); - auto merged = applyMerge(dm_context, left, left_snap, right, right_snap, wbs, merged_stable); + auto merged = applyMerge(dm_context, ordered_segments, ordered_snapshots, wbs, merged_stable); wbs.writeAll(); return merged; @@ -1251,26 +1252,36 @@ SegmentPtr Segment::merge(DMContext & dm_context, const ColumnDefinesPtr & schem /// So remember to do a flush for the segments before merge. StableValueSpacePtr Segment::prepareMerge(DMContext & dm_context, // const ColumnDefinesPtr & schema_snap, - const SegmentPtr & left, - const SegmentSnapshotPtr & left_snap, - const SegmentPtr & right, - const SegmentSnapshotPtr & right_snap, + const std::vector & ordered_segments, + const std::vector & ordered_snapshots, WriteBatches & wbs) { - LOG_FMT_DEBUG(left->log, "Merge - Begin prepare, left={} right={}", left->simpleInfo(), right->simpleInfo()); + RUNTIME_CHECK(ordered_segments.size() >= 2, ordered_snapshots.size()); + RUNTIME_CHECK(ordered_segments.size() == ordered_snapshots.size(), ordered_segments.size(), ordered_snapshots.size()); - if (unlikely(compare(left->rowkey_range.getEnd(), right->rowkey_range.getStart()) != 0 || left->next_segment_id != right->segment_id)) - throw Exception( - fmt::format("The ranges of merge segments are not consecutive: leftEnd={} rightStart={}", - left->rowkey_range.getEnd().toDebugString(), - right->rowkey_range.getStart().toDebugString())); + const auto & log = ordered_segments[0]->log; + LOG_FMT_DEBUG(log, "Merge - Begin prepare, segments_to_merge={}", simpleInfo(ordered_segments)); + + for (size_t i = 1; i < ordered_segments.size(); i++) + { + RUNTIME_CHECK( + compare(ordered_segments[i - 1]->rowkey_range.getEnd(), ordered_segments[i]->rowkey_range.getStart()) == 0, + i, + ordered_segments[i - 1]->info(), + ordered_segments[i]->info()); + RUNTIME_CHECK( + ordered_segments[i - 1]->next_segment_id == ordered_segments[i]->segment_id, + i, + ordered_segments[i - 1]->info(), + ordered_segments[i]->info()); + } auto get_stream = [&](const SegmentPtr & segment, const SegmentSnapshotPtr & segment_snap) { auto read_info = segment->getReadInfo( dm_context, *schema_snap, segment_snap, - {RowKeyRange::newAll(left->is_common_handle, left->rowkey_column_size)}); + {RowKeyRange::newAll(segment->is_common_handle, segment->rowkey_column_size)}); RowKeyRanges rowkey_ranges{segment->rowkey_range}; BlockInputStreamPtr stream = getPlacedStream(dm_context, *read_info.read_columns, @@ -1293,10 +1304,12 @@ StableValueSpacePtr Segment::prepareMerge(DMContext & dm_context, // return stream; }; - auto left_stream = get_stream(left, left_snap); - auto right_stream = get_stream(right, right_snap); + std::vector input_streams; + input_streams.reserve(ordered_segments.size()); + for (size_t i = 0; i < ordered_segments.size(); i++) + input_streams.emplace_back(get_stream(ordered_segments[i], ordered_snapshots[i])); - BlockInputStreamPtr merged_stream = std::make_shared(BlockInputStreams{left_stream, right_stream}, /*req_id=*/""); + BlockInputStreamPtr merged_stream = std::make_shared(input_streams, /*req_id=*/""); // for the purpose to calculate StableProperty of the new segment merged_stream = std::make_shared>( merged_stream, @@ -1304,44 +1317,48 @@ StableValueSpacePtr Segment::prepareMerge(DMContext & dm_context, // dm_context.min_version, dm_context.is_common_handle); - auto merged_stable_id = left->stable->getId(); + auto merged_stable_id = ordered_segments[0]->stable->getId(); auto merged_stable = createNewStable(dm_context, schema_snap, merged_stream, merged_stable_id, wbs); - LOG_FMT_DEBUG(left->log, "Merge - Finish prepare, left={} right={}", left->info(), right->info()); + LOG_FMT_DEBUG(log, "Merge - Finish prepare, segments_to_merge={}", info(ordered_segments)); return merged_stable; } SegmentPtr Segment::applyMerge(DMContext & dm_context, // - const SegmentPtr & left, - const SegmentSnapshotPtr & left_snap, - const SegmentPtr & right, - const SegmentSnapshotPtr & right_snap, + const std::vector & ordered_segments, + const std::vector & ordered_snapshots, WriteBatches & wbs, const StableValueSpacePtr & merged_stable) { - LOG_FMT_DEBUG(left->log, "Merge - Begin apply, left={} right={}", left->simpleInfo(), right->simpleInfo()); + RUNTIME_CHECK(ordered_segments.size() >= 2, ordered_snapshots.size()); + RUNTIME_CHECK(ordered_segments.size() == ordered_snapshots.size(), ordered_segments.size(), ordered_snapshots.size()); - RowKeyRange merged_range(left->rowkey_range.start, right->rowkey_range.end, left->is_common_handle, left->rowkey_column_size); + const auto & first_seg = ordered_segments.front(); + const auto & last_seg = ordered_segments.back(); + const auto & log = first_seg->log; + LOG_FMT_DEBUG(log, "Merge - Begin apply, segments_to_merge={}", simpleInfo(ordered_segments)); - auto [left_persisted_files, left_in_memory_files] = left->delta->checkHeadAndCloneTail(dm_context, merged_range, left_snap->delta->getColumnFilesInSnapshot(), wbs); - auto [right_persisted_files, right_in_memory_files] = right->delta->checkHeadAndCloneTail(dm_context, merged_range, right_snap->delta->getColumnFilesInSnapshot(), wbs); + RowKeyRange merged_range(first_seg->rowkey_range.start, last_seg->rowkey_range.end, first_seg->is_common_handle, first_seg->rowkey_column_size); + + ColumnFilePersisteds merged_persisted_column_files; + ColumnFiles merged_in_memory_files; + for (size_t i = 0; i < ordered_segments.size(); i++) + { + const auto [persisted_files, in_memory_files] = ordered_segments[i]->delta->checkHeadAndCloneTail(dm_context, merged_range, ordered_snapshots[i]->delta->getColumnFilesInSnapshot(), wbs); + merged_persisted_column_files.insert(merged_persisted_column_files.end(), persisted_files.begin(), persisted_files.end()); + merged_in_memory_files.insert(merged_in_memory_files.end(), in_memory_files.begin(), in_memory_files.end()); + } // Created references to tail pages' pages in "log" storage, we need to write them down. wbs.writeLogAndData(); - ColumnFilePersisteds merged_persisted_column_files = std::move(left_persisted_files); - ColumnFiles merged_in_memory_files = std::move(left_in_memory_files); + auto merged_delta = std::make_shared(first_seg->delta->getId(), merged_persisted_column_files, merged_in_memory_files); - merged_persisted_column_files.insert(merged_persisted_column_files.end(), right_persisted_files.begin(), right_persisted_files.end()); - merged_in_memory_files.insert(merged_in_memory_files.end(), right_in_memory_files.begin(), right_in_memory_files.end()); - - auto merged_delta = std::make_shared(left->delta->getId(), merged_persisted_column_files, merged_in_memory_files); - - auto merged = std::make_shared(left->epoch + 1, // + auto merged = std::make_shared(first_seg->epoch + 1, // merged_range, - left->segment_id, - right->next_segment_id, + first_seg->segment_id, + last_seg->next_segment_id, merged_delta, merged_stable); @@ -1350,17 +1367,20 @@ SegmentPtr Segment::applyMerge(DMContext & dm_context, // merged->stable->saveMeta(wbs.meta); merged->serialize(wbs.meta); - left->delta->recordRemoveColumnFilesPages(wbs); - left->stable->recordRemovePacksPages(wbs); - - right->delta->recordRemoveColumnFilesPages(wbs); - right->stable->recordRemovePacksPages(wbs); - - wbs.removed_meta.delPage(right->segmentId()); - wbs.removed_meta.delPage(right->delta->getId()); - wbs.removed_meta.delPage(right->stable->getId()); + for (size_t i = 0; i < ordered_segments.size(); i++) + { + const auto & seg = ordered_segments[i]; + seg->delta->recordRemoveColumnFilesPages(wbs); + seg->stable->recordRemovePacksPages(wbs); + if (i > 0) // The first seg's id is preserved, so don't del id. + { + wbs.removed_meta.delPage(seg->segmentId()); + wbs.removed_meta.delPage(seg->delta->getId()); + wbs.removed_meta.delPage(seg->stable->getId()); + } + } - LOG_FMT_DEBUG(left->log, "Merge - Finish apply, left={} right={} merged={}", left->info(), right->info(), merged->info()); + LOG_FMT_DEBUG(log, "Merge - Finish apply, merged={} merged_from_segments={}", merged->info(), info(ordered_segments)); return merged; } @@ -1448,6 +1468,36 @@ String Segment::info() const stable->getBytes()); } +String Segment::simpleInfo(const std::vector & segments) +{ + FmtBuffer fmt_buf; + fmt_buf.fmtAppend("[{} segments: ", segments.size()); + fmt_buf.joinStr( + segments.cbegin(), + segments.cend(), + [&](const SegmentPtr & seg, FmtBuffer & fb) { + fb.append(seg->simpleInfo()); + }, + ", "); + fmt_buf.fmtAppend("]"); + return fmt_buf.toString(); +} + +String Segment::info(const std::vector & segments) +{ + FmtBuffer fmt_buf; + fmt_buf.fmtAppend("[{} segments: ", segments.size()); + fmt_buf.joinStr( + segments.cbegin(), + segments.cend(), + [&](const SegmentPtr & seg, FmtBuffer & fb) { + fb.append(seg->info()); + }, + ", "); + fmt_buf.fmtAppend("]"); + return fmt_buf.toString(); +} + void Segment::drop(const FileProviderPtr & file_provider, WriteBatches & wbs) { delta->recordRemoveColumnFilesPages(wbs); diff --git a/dbms/src/Storages/DeltaMerge/Segment.h b/dbms/src/Storages/DeltaMerge/Segment.h index 85a9b99c80d..aaf0755025f 100644 --- a/dbms/src/Storages/DeltaMerge/Segment.h +++ b/dbms/src/Storages/DeltaMerge/Segment.h @@ -195,59 +195,65 @@ class Segment : private boost::noncopyable /// For those split, merge and mergeDelta methods, we should use prepareXXX/applyXXX combo in real production. /// split(), merge() and mergeDelta() are only used in test cases. - SegmentPair split(DMContext & dm_context, const ColumnDefinesPtr & schema_snap) const; + /** + * Only used in tests as a shortcut. + * Normally you should use `prepareSplit` and `applySplit`. + */ + [[nodiscard]] SegmentPair split(DMContext & dm_context, const ColumnDefinesPtr & schema_snap) const; + std::optional prepareSplit( DMContext & dm_context, const ColumnDefinesPtr & schema_snap, const SegmentSnapshotPtr & segment_snap, WriteBatches & wbs) const; - SegmentPair applySplit( + [[nodiscard]] SegmentPair applySplit( DMContext & dm_context, const SegmentSnapshotPtr & segment_snap, WriteBatches & wbs, SplitInfo & split_info) const; - static SegmentPtr merge( + /** + * Only used in tests as a shortcut. + * Normally you should use `prepareMerge` and `applyMerge`. + */ + [[nodiscard]] static SegmentPtr merge( DMContext & dm_context, const ColumnDefinesPtr & schema_snap, - const SegmentPtr & left, - const SegmentPtr & right); + const std::vector & ordered_segments); + static StableValueSpacePtr prepareMerge( DMContext & dm_context, const ColumnDefinesPtr & schema_snap, - const SegmentPtr & left, - const SegmentSnapshotPtr & left_snap, - const SegmentPtr & right, - const SegmentSnapshotPtr & right_snap, + const std::vector & ordered_segments, + const std::vector & ordered_snapshots, WriteBatches & wbs); - static SegmentPtr applyMerge( + + [[nodiscard]] static SegmentPtr applyMerge( DMContext & dm_context, - const SegmentPtr & left, - const SegmentSnapshotPtr & left_snap, - const SegmentPtr & right, - const SegmentSnapshotPtr & right_snap, + const std::vector & ordered_segments, + const std::vector & ordered_snapshots, WriteBatches & wbs, const StableValueSpacePtr & merged_stable); - /// Merge the delta (major compaction) and return the new segment. - /// - /// Note: This is only a shortcut function used in tests. - /// Normally you should call `prepareMergeDelta`, `applyMergeDelta` instead. - SegmentPtr mergeDelta(DMContext & dm_context, const ColumnDefinesPtr & schema_snap) const; + /** + * Only used in tests as a shortcut. + * Normally you should use `prepareMergeDelta` and `applyMergeDelta`. + */ + [[nodiscard]] SegmentPtr mergeDelta(DMContext & dm_context, const ColumnDefinesPtr & schema_snap) const; StableValueSpacePtr prepareMergeDelta( DMContext & dm_context, const ColumnDefinesPtr & schema_snap, const SegmentSnapshotPtr & segment_snap, WriteBatches & wbs) const; - SegmentPtr applyMergeDelta( + [[nodiscard]] SegmentPtr applyMergeDelta( DMContext & dm_context, const SegmentSnapshotPtr & segment_snap, WriteBatches & wbs, const StableValueSpacePtr & new_stable) const; - SegmentPtr dropNextSegment(WriteBatches & wbs, const RowKeyRange & next_segment_range); + [[nodiscard]] SegmentPtr dropNextSegment(WriteBatches & wbs, const RowKeyRange & next_segment_range); /// Flush delta's cache packs. bool flushCache(DMContext & dm_context); @@ -274,6 +280,9 @@ class Segment : private boost::noncopyable String simpleInfo() const; String info() const; + static String simpleInfo(const std::vector & segments); + static String info(const std::vector & segments); + using Lock = DeltaValueSpace::Lock; bool getUpdateLock(Lock & lock) const { return delta->getLock(lock); } diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_dm_segment.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_dm_segment.cpp index 491b2220603..dba988a9f7c 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_dm_segment.cpp +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_dm_segment.cpp @@ -796,7 +796,7 @@ try // merge segments { - segment = Segment::merge(dmContext(), tableColumns(), segment, new_segment); + segment = Segment::merge(dmContext(), tableColumns(), {segment, new_segment}); { // check merged segment range const auto & merged_range = segment->getRowKeyRange(); @@ -1141,7 +1141,7 @@ try ASSERT_EQ(rows1 + rows2, (size_t)200); } - // Test merge + // Test merge two { WriteBatches wbs(dmContext().storage_pool); @@ -1152,7 +1152,7 @@ try write_100_rows(other_segment); segment->flushCache(dmContext()); - auto merged_stable = Segment::prepareMerge(dmContext(), tableColumns(), segment, left_snap, other_segment, right_snap, wbs); + auto merged_stable = Segment::prepareMerge(dmContext(), tableColumns(), {segment, other_segment}, {left_snap, right_snap}, wbs); wbs.writeLogAndData(); merged_stable->enableDMFilesGC(); @@ -1160,7 +1160,7 @@ try auto left_lock = segment->mustGetUpdateLock(); auto right_lock = other_segment->mustGetUpdateLock(); - segment = Segment::applyMerge(dmContext(), segment, left_snap, other_segment, right_snap, wbs, merged_stable); + segment = Segment::applyMerge(dmContext(), {segment, other_segment}, {left_snap, right_snap}, wbs, merged_stable); wbs.writeAll(); } @@ -1170,6 +1170,50 @@ try auto rows = read_rows(new_segment); ASSERT_EQ(rows, (size_t)300); } + + // Split into 3 + SegmentPtr seg1, seg2, seg3; + { + std::tie(seg1, seg2) = segment->split(dmContext(), tableColumns()); + ASSERT_TRUE(seg1); + ASSERT_TRUE(seg2); + std::tie(seg2, seg3) = seg2->split(dmContext(), tableColumns()); + ASSERT_TRUE(seg2); + ASSERT_TRUE(seg3); + } + +// { +// // TODO: This test case can be more readable when we support split at point. +// +// auto actual_new_rows = 0; +// { +// actual_new_rows += (seg2->getRowKeyRange().getEnd().int_value - seg2->getRowKeyRange().getStart().int_value); +// Block block = DMTestEnv::prepareSimpleWriteBlock(seg2->getRowKeyRange().getStart().int_value - 50, seg2->getRowKeyRange().getEnd().int_value + 50, false, /* tso */ 5); +// segment->write(dmContext(), std::move(block)); +// // Not flushed. +// } +// +// WriteBatches wbs(dmContext().storage_pool); +// auto snap1 = seg1->createSnapshot(dmContext(), true, CurrentMetrics::DT_SnapshotOfSegmentMerge); +// auto snap2 = seg2->createSnapshot(dmContext(), true, CurrentMetrics::DT_SnapshotOfSegmentMerge); +// auto snap3 = seg3->createSnapshot(dmContext(), true, CurrentMetrics::DT_SnapshotOfSegmentMerge); +// +// { +// actual_new_rows += 100; +// Block block = DMTestEnv::prepareSimpleWriteBlock(seg3->getRowKeyRange().getStart().int_value - 50, seg3->getRowKeyRange().getStart().int_value + 100, false, /* tso */ 5); +// segment->write(dmContext(), std::move(block)); +// // Not flushed, write after snapshot. +// } +// +// { +// actual_new_rows += 42; +// Block block = DMTestEnv::prepareSimpleWriteBlock(seg1->getRowKeyRange().getEnd().int_value - 42, seg1->getRowKeyRange().getEnd().int_value + 50, false, /* tso */ 5); +// segment->write(dmContext(), std::move(block)); +// segment->flushCache(dmContext()); // Flushed after snapshot. +// } +// +// +// } } CATCH diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_dm_segment_common_handle.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_dm_segment_common_handle.cpp index faa52c219e2..f577cb4a09d 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_dm_segment_common_handle.cpp +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_dm_segment_common_handle.cpp @@ -708,7 +708,7 @@ try // TODO: enable merge test! if (false) { - segment = Segment::merge(dmContext(), tableColumns(), segment, new_segment); + segment = Segment::merge(dmContext(), tableColumns(), {segment, new_segment}); { // check merged segment range const auto & merged_range = segment->getRowKeyRange(); diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_segment.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_segment.cpp index a53f2cae0a3..3d62367bcdd 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_segment.cpp +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_segment.cpp @@ -65,7 +65,7 @@ try auto segment_id = splitSegment(DELTA_MERGE_FIRST_SEGMENT_ID); ASSERT_TRUE(segment_id.has_value()); - mergeSegment(DELTA_MERGE_FIRST_SEGMENT_ID, *segment_id); + mergeSegment({DELTA_MERGE_FIRST_SEGMENT_ID, *segment_id}); } CATCH @@ -86,7 +86,7 @@ try flushSegmentCache(*segment_id); deleteRangeSegment(*segment_id); writeSegmentWithDeletedPack(*segment_id); - mergeSegment(DELTA_MERGE_FIRST_SEGMENT_ID, *segment_id); + mergeSegment({ DELTA_MERGE_FIRST_SEGMENT_ID, *segment_id }); EXPECT_EQ(getSegmentRowNum(DELTA_MERGE_FIRST_SEGMENT_ID), origin_rows); } @@ -95,7 +95,6 @@ CATCH TEST_F(SegmentOperationTest, TestSegmentRandom) try { - srand(time(nullptr)); SegmentTestOptions options; options.is_common_handle = true; reloadWithOptions(options); @@ -192,7 +191,7 @@ try th_seg_split.wait(); LOG_DEBUG(log, "finishApplySplit"); - mergeSegment(DELTA_MERGE_FIRST_SEGMENT_ID, new_seg_id); + mergeSegment({DELTA_MERGE_FIRST_SEGMENT_ID, new_seg_id}); } for (const auto & [seg_id, seg] : segments) @@ -242,7 +241,7 @@ try // Start a segment merge and suspend it before applyMerge auto sp_seg_merge_apply = SyncPointCtl::enableInScope("before_Segment::applyMerge"); auto th_seg_merge = std::async([&]() { - mergeSegment(DELTA_MERGE_FIRST_SEGMENT_ID, new_seg_id, /* check_rows */ false); + mergeSegment({DELTA_MERGE_FIRST_SEGMENT_ID, new_seg_id}, /* check_rows */ false); }); sp_seg_merge_apply.waitAndPause(); @@ -289,7 +288,6 @@ CATCH TEST_F(SegmentOperationTest, DISABLED_TestSegmentRandomForCI) try { - srand(time(nullptr)); SegmentTestOptions options; options.is_common_handle = true; reloadWithOptions(options); @@ -360,7 +358,7 @@ try // Start a segment merge and suspend it before applyMerge auto sp_seg_merge_apply = SyncPointCtl::enableInScope("before_Segment::applyMerge"); auto th_seg_merge = std::async([&]() { - mergeSegment(DELTA_MERGE_FIRST_SEGMENT_ID, new_seg_id, /*check_rows=*/false); + mergeSegment({DELTA_MERGE_FIRST_SEGMENT_ID, new_seg_id}, /*check_rows=*/false); }); sp_seg_merge_apply.waitAndPause(); LOG_DEBUG(log, "pausedBeforeApplyMerge"); @@ -430,7 +428,7 @@ try // Start a segment merge and suspend it before applyMerge auto sp_seg_merge_apply = SyncPointCtl::enableInScope("before_Segment::applyMerge"); auto th_seg_merge = std::async([&]() { - mergeSegment(DELTA_MERGE_FIRST_SEGMENT_ID, new_seg_id, /*check_rows=*/false); + mergeSegment({DELTA_MERGE_FIRST_SEGMENT_ID, new_seg_id}, /*check_rows=*/false); }); sp_seg_merge_apply.waitAndPause(); LOG_DEBUG(log, "pausedBeforeApplyMerge"); diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp index 9da4401c924..e23213efa9a 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp @@ -11,6 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + #include #include #include @@ -39,6 +40,11 @@ namespace tests { void SegmentTestBasic::reloadWithOptions(SegmentTestOptions config) { + { + auto const seed = std::random_device{}(); + random = std::mt19937{seed}; + } + TiFlashStorageTestBasic::SetUp(); options = config; table_columns = std::make_shared(); @@ -94,41 +100,53 @@ std::optional SegmentTestBasic::splitSegment(PageId segment_id, bool che { auto origin_segment = segments[segment_id]; size_t origin_segment_row_num = getSegmentRowNum(segment_id); - SegmentPtr segment, new_segment; - std::tie(segment, new_segment) = origin_segment->split(dmContext(), tableColumns()); - if (new_segment) + auto [left, right] = origin_segment->split(dmContext(), tableColumns()); + if (left || right) { - segments[new_segment->segmentId()] = new_segment; - segments[segment_id] = segment; - + RUNTIME_CHECK(left && right); + RUNTIME_CHECK(left->segmentId() == segment_id, segment_id, left->info()); + segments[left->segmentId()] = left; // The left segment is updated + segments[right->segmentId()] = right; if (check_rows) - { - EXPECT_EQ(origin_segment_row_num, getSegmentRowNum(segment_id) + getSegmentRowNum(new_segment->segmentId())); - } - return new_segment->segmentId(); + EXPECT_EQ(origin_segment_row_num, getSegmentRowNum(segment_id) + getSegmentRowNum(right->segmentId())); + return right->segmentId(); } return std::nullopt; } -void SegmentTestBasic::mergeSegment(PageId left_segment_id, PageId right_segment_id, bool check_rows) +void SegmentTestBasic::mergeSegment(const std::vector & segments_id, bool check_rows) { - auto left_segment = segments[left_segment_id]; - auto right_segment = segments[right_segment_id]; + RUNTIME_CHECK(segments_id.size() >= 2, segments_id.size()); - size_t left_segment_row_num = getSegmentRowNum(left_segment_id); - size_t right_segment_row_num = getSegmentRowNum(right_segment_id); - LOG_FMT_TRACE(&Poco::Logger::root(), "merge in segment:{}:{} and {}:{}", left_segment->segmentId(), left_segment_row_num, right_segment->segmentId(), right_segment_row_num); + std::vector segments_to_merge; + std::vector segments_rows; + segments_to_merge.reserve(segments_id.size()); + segments_rows.reserve(segments_id.size()); - SegmentPtr merged_segment = Segment::merge(dmContext(), tableColumns(), left_segment, right_segment); - segments[merged_segment->segmentId()] = merged_segment; - auto it = segments.find(right_segment->segmentId()); - if (it != segments.end()) + for (const auto segment_id : segments_id) { - segments.erase(it); + auto it = segments.find(segment_id); + RUNTIME_CHECK(it != segments.end(), segment_id); + segments_to_merge.emplace_back(it->second); + segments_rows.emplace_back(getSegmentRowNum(segment_id)); } + + LOG_FMT_TRACE(&Poco::Logger::root(), "merge segment [{}], each_rows=[{}]", + fmt::join(segments_id, ","), + fmt::join(segments_rows, ",")); + + SegmentPtr merged_segment = Segment::merge(dmContext(), tableColumns(), segments_to_merge); + if (!merged_segment) + return; + + for (const auto segment_id : segments_id) + segments.erase(segments.find(segment_id)); + segments[merged_segment->segmentId()] = merged_segment; + if (check_rows) { - EXPECT_EQ(getSegmentRowNum(merged_segment->segmentId()), left_segment_row_num + right_segment_row_num); + auto merged_rows = std::accumulate(segments_rows.begin(), segments_rows.end(), 0); + EXPECT_EQ(getSegmentRowNum(merged_segment->segmentId()), merged_rows); } } @@ -161,15 +179,27 @@ std::pair SegmentTestBasic::getSegmentKeyRange(SegmentPtr segment) end_key = segment->getRowKeyRange().getEnd().int_value; return {start_key, end_key}; } - EXPECT_EQ(segment->getRowKeyRange().getStart().data[0], TiDB::CodecFlagInt); - EXPECT_EQ(segment->getRowKeyRange().getEnd().data[0], TiDB::CodecFlagInt); + + const auto & range = segment->getRowKeyRange(); + if (range.isStartInfinite()) + { + start_key = std::numeric_limits::min(); + } + else { + EXPECT_EQ(range.getStart().data[0], TiDB::CodecFlagInt); size_t cursor = 1; - start_key = DecodeInt64(cursor, String(segment->getRowKeyRange().getStart().data, segment->getRowKeyRange().getStart().size)); + start_key = DecodeInt64(cursor, String(range.getStart().data, range.getStart().size)); } + if (range.isEndInfinite()) + { + end_key = std::numeric_limits::max(); + } + else { + EXPECT_EQ(range.getEnd().data[0], TiDB::CodecFlagInt); size_t cursor = 1; - end_key = DecodeInt64(cursor, String(segment->getRowKeyRange().getEnd().data, segment->getRowKeyRange().getEnd().size)); + end_key = DecodeInt64(cursor, String(range.getEnd().data, range.getEnd().size)); } return {start_key, end_key}; } @@ -177,14 +207,11 @@ std::pair SegmentTestBasic::getSegmentKeyRange(SegmentPtr segment) void SegmentTestBasic::writeSegment(PageId segment_id, UInt64 write_rows) { if (write_rows == 0) - { return; - } + auto segment = segments[segment_id]; size_t segment_row_num = getSegmentRowNumWithoutMVCC(segment_id); - std::pair keys = getSegmentKeyRange(segment); - Int64 start_key = keys.first; - Int64 end_key = keys.second; + auto [start_key, end_key] = getSegmentKeyRange(segment); // If the length of segment key range is larger than `write_rows`, then // write the new data with the same tso in one block. @@ -221,9 +248,7 @@ void SegmentTestBasic::writeSegment(PageId segment_id, UInt64 write_rows) void SegmentTestBasic::ingestDTFileIntoSegment(PageId segment_id, UInt64 write_rows) { if (write_rows == 0) - { return; - } auto write_data = [&](SegmentPtr segment, const Block & block) { WriteBatches ingest_wbs(dm_context->storage_pool, dm_context->getWriteLimiter()); @@ -258,9 +283,7 @@ void SegmentTestBasic::ingestDTFileIntoSegment(PageId segment_id, UInt64 write_r auto segment = segments[segment_id]; size_t segment_row_num = getSegmentRowNumWithoutMVCC(segment_id); - std::pair keys = getSegmentKeyRange(segment); - Int64 start_key = keys.first; - Int64 end_key = keys.second; + auto [start_key, end_key] = getSegmentKeyRange(segment); // If the length of segment key range is larger than `write_rows`, then // write the new data with the same tso in one block. @@ -299,9 +322,7 @@ void SegmentTestBasic::writeSegmentWithDeletedPack(PageId segment_id) UInt64 write_rows = DEFAULT_MERGE_BLOCK_SIZE; auto segment = segments[segment_id]; size_t segment_row_num = getSegmentRowNumWithoutMVCC(segment_id); - std::pair keys = getSegmentKeyRange(segment); - Int64 start_key = keys.first; - Int64 end_key = keys.second; + auto [start_key, end_key] = getSegmentKeyRange(segment); // If the length of segment key range is larger than `write_rows`, then // write the new data with the same tso in one block. @@ -345,138 +366,123 @@ void SegmentTestBasic::deleteRangeSegment(PageId segment_id) void SegmentTestBasic::writeRandomSegment() { if (segments.empty()) - { return; - } PageId random_segment_id = getRandomSegmentId(); - LOG_FMT_TRACE(&Poco::Logger::root(), "start write segment:{}", random_segment_id); + LOG_FMT_DEBUG(&Poco::Logger::root(), "start write segment:{}", random_segment_id); writeSegment(random_segment_id); } void SegmentTestBasic::writeRandomSegmentWithDeletedPack() { if (segments.empty()) - { return; - } PageId random_segment_id = getRandomSegmentId(); - LOG_FMT_TRACE(&Poco::Logger::root(), "start write segment with deleted pack:{}", random_segment_id); + LOG_FMT_DEBUG(&Poco::Logger::root(), "start write segment with deleted pack:{}", random_segment_id); writeSegmentWithDeletedPack(random_segment_id); } void SegmentTestBasic::deleteRangeRandomSegment() { if (segments.empty()) - { return; - } PageId random_segment_id = getRandomSegmentId(); - LOG_FMT_TRACE(&Poco::Logger::root(), "start delete range segment:{}", random_segment_id); + LOG_FMT_DEBUG(&Poco::Logger::root(), "start delete range segment:{}", random_segment_id); deleteRangeSegment(random_segment_id); } void SegmentTestBasic::splitRandomSegment() { if (segments.empty()) - { return; - } PageId random_segment_id = getRandomSegmentId(); - LOG_FMT_TRACE(&Poco::Logger::root(), "start split segment:{}", random_segment_id); + LOG_FMT_DEBUG(&Poco::Logger::root(), "start split segment:{}", random_segment_id); splitSegment(random_segment_id); } void SegmentTestBasic::mergeRandomSegment() { - if (segments.empty() || segments.size() == 1) - { + if (segments.size() < 2) return; - } - std::pair segment_pair; - segment_pair = getRandomMergeablePair(); - LOG_FMT_TRACE(&Poco::Logger::root(), "start merge segment:{} and {}", segment_pair.first, segment_pair.second); - mergeSegment(segment_pair.first, segment_pair.second); + auto segments_id = getRandomMergeableSegments(); + LOG_FMT_DEBUG(&Poco::Logger::root(), "start merge segments [{}]", fmt::join(segments_id, ",")); + mergeSegment(segments_id); } void SegmentTestBasic::mergeDeltaRandomSegment() { if (segments.empty()) - { return; - } PageId random_segment_id = getRandomSegmentId(); - LOG_FMT_TRACE(&Poco::Logger::root(), "start merge delta in segment:{}", random_segment_id); + LOG_FMT_DEBUG(&Poco::Logger::root(), "start merge delta in segment:{}", random_segment_id); mergeSegmentDelta(random_segment_id); } void SegmentTestBasic::flushCacheRandomSegment() { if (segments.empty()) - { return; - } PageId random_segment_id = getRandomSegmentId(); - LOG_FMT_TRACE(&Poco::Logger::root(), "start flush cache in segment:{}", random_segment_id); + LOG_FMT_DEBUG(&Poco::Logger::root(), "start flush cache in segment:{}", random_segment_id); flushSegmentCache(random_segment_id); } void SegmentTestBasic::randomSegmentTest(size_t operator_count) { + auto dist = std::uniform_int_distribution{0, static_cast(SegmentOperatorType::SegmentOperatorMax) - 1}; for (size_t i = 0; i < operator_count; i++) { - auto op = static_cast(random() % SegmentOperatorMax); + auto op = dist(random); segment_operator_entries[op](); } } -PageId SegmentTestBasic::getRandomSegmentId() +PageId SegmentTestBasic::getRandomSegmentId() // Complexity is O(n) { - auto max_segment_id = segments.rbegin()->first; - PageId random_segment_id = random() % (max_segment_id + 1); - auto it = segments.find(random_segment_id); - while (it == segments.end()) - { - random_segment_id = random() % (max_segment_id + 1); - it = segments.find(random_segment_id); - } - return random_segment_id; + RUNTIME_CHECK(!segments.empty()); + auto dist = std::uniform_int_distribution{0, segments.size() - 1}; + auto pick_n = dist(random); + auto it = segments.begin(); + std::advance(it, pick_n); + auto segment_id = it->second->segmentId(); + RUNTIME_CHECK(segments.find(segment_id) != segments.end(), segment_id); + RUNTIME_CHECK(segments[segment_id]->segmentId() == segment_id); + return segment_id; } -std::pair SegmentTestBasic::getRandomMergeablePair() +std::vector SegmentTestBasic::getRandomMergeableSegments() { + RUNTIME_CHECK(segments.size() >= 2, segments.size()); + + auto max_merge_segments = std::uniform_int_distribution{2, 6}(random); + + std::vector segments_id; + segments_id.reserve(max_merge_segments); + while (true) { - PageId random_left_segment_id = getRandomSegmentId(); - PageId random_right_segment_id = random_left_segment_id; - while (random_right_segment_id == random_left_segment_id) - { - random_right_segment_id = getRandomSegmentId(); - } - auto left_segment = segments[random_left_segment_id]; - auto right_segment = segments[random_right_segment_id]; - if (compare(left_segment->getRowKeyRange().getEnd(), right_segment->getRowKeyRange().getStart()) != 0 || left_segment->nextSegmentId() != right_segment->segmentId()) + segments_id.clear(); + segments_id.push_back(getRandomSegmentId()); + + for (int i = 1; i < max_merge_segments; i++) { - continue; + auto last_segment_id = segments_id.back(); + RUNTIME_CHECK(segments.find(last_segment_id) != segments.end(), last_segment_id); + auto last_segment = segments[last_segment_id]; + if (last_segment->getRowKeyRange().isEndInfinite()) + break; + + auto next_segment_id = last_segment->nextSegmentId(); + RUNTIME_CHECK(segments.find(next_segment_id) != segments.end(), last_segment->info()); + auto next_segment = segments[next_segment_id]; + RUNTIME_CHECK(next_segment->segmentId() == next_segment_id, next_segment->info(), next_segment_id); + RUNTIME_CHECK(compare(last_segment->getRowKeyRange().getEnd(), next_segment->getRowKeyRange().getStart()) == 0, last_segment->info(), next_segment->info()); + segments_id.push_back(next_segment_id); } - return {random_left_segment_id, random_right_segment_id}; - } -} -RowKeyRange SegmentTestBasic::commonHandleKeyRange() -{ - String start_key, end_key; - { - WriteBufferFromOwnString ss; - ::DB::EncodeUInt(static_cast(TiDB::CodecFlagInt), ss); - ::DB::EncodeInt64(std::numeric_limits::min(), ss); - start_key = ss.releaseStr(); + if (segments_id.size() >= 2) + break; } - { - WriteBufferFromOwnString ss; - ::DB::EncodeUInt(static_cast(TiDB::CodecFlagInt), ss); - ::DB::EncodeInt64(std::numeric_limits::max(), ss); - end_key = ss.releaseStr(); - } - return RowKeyRange(RowKeyValue(true, std::make_shared(start_key), 0), RowKeyValue(true, std::make_shared(end_key), 0), true, 1); + + return segments_id; } SegmentPtr SegmentTestBasic::reload(bool is_common_handle, const ColumnDefinesPtr & pre_define_columns, DB::Settings && db_settings) @@ -488,7 +494,7 @@ SegmentPtr SegmentTestBasic::reload(bool is_common_handle, const ColumnDefinesPt ColumnDefinesPtr cols = (!pre_define_columns) ? DMTestEnv::getDefaultColumns(is_common_handle ? DMTestEnv::PkType::CommonHandle : DMTestEnv::PkType::HiddenTiDBRowID) : pre_define_columns; setColumns(cols); - return Segment::newSegment(*dm_context, table_columns, is_common_handle ? commonHandleKeyRange() : RowKeyRange::newAll(is_common_handle, 1), storage_pool->newMetaPageId(), 0); + return Segment::newSegment(*dm_context, table_columns, RowKeyRange::newAll(is_common_handle, 1), storage_pool->newMetaPageId(), 0); } void SegmentTestBasic::setColumns(const ColumnDefinesPtr & columns) diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.h b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.h index b8a37d5d38e..9db0cd15da0 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.h +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.h @@ -21,6 +21,7 @@ #include #include +#include namespace DB { @@ -43,7 +44,7 @@ class SegmentTestBasic : public DB::base::TiFlashStorageTestBasic // When `check_rows` is true, it will compare the rows num before and after the segment update. // So if there is some write during the segment update, it will report false failure if `check_rows` is true. std::optional splitSegment(PageId segment_id, bool check_rows = true); - void mergeSegment(PageId left_segment_id, PageId right_segment_id, bool check_rows = true); + void mergeSegment(const std::vector & segments, bool check_rows = true); void mergeSegmentDelta(PageId segment_id, bool check_rows = true); void flushSegmentCache(PageId segment_id); @@ -70,10 +71,12 @@ class SegmentTestBasic : public DB::base::TiFlashStorageTestBasic std::pair getSegmentKeyRange(SegmentPtr segment); protected: + std::mt19937 random; + // std::map segments; - enum SegmentOperatorType + enum class SegmentOperatorType: size_t { Write = 0, DeleteRange, @@ -98,9 +101,7 @@ class SegmentTestBasic : public DB::base::TiFlashStorageTestBasic PageId getRandomSegmentId(); - std::pair getRandomMergeablePair(); - - RowKeyRange commonHandleKeyRange(); + std::vector getRandomMergeableSegments(); SegmentPtr reload(bool is_common_handle, const ColumnDefinesPtr & pre_define_columns, DB::Settings && db_settings); From 2243b0e36fa47fede94e067eacae5fa66eefa529 Mon Sep 17 00:00:00 2001 From: Wish Date: Mon, 12 Sep 2022 23:25:07 +0800 Subject: [PATCH 07/17] grumble --- .../tests/gtest_segment_test_basic.cpp | 131 ++++++++++++------ .../tests/gtest_segment_test_basic.h | 52 +++---- 2 files changed, 113 insertions(+), 70 deletions(-) diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp index e23213efa9a..5271cfb015d 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp @@ -45,6 +45,9 @@ void SegmentTestBasic::reloadWithOptions(SegmentTestOptions config) random = std::mt19937{seed}; } + logger = Logger::get("SegmentTest"); + logger_op = Logger::get("SegmentTestOperation"); + TiFlashStorageTestBasic::SetUp(); options = config; table_columns = std::make_shared(); @@ -88,34 +91,42 @@ size_t SegmentTestBasic::getSegmentRowNum(PageId segment_id) return getInputStreamNRows(in); } -void SegmentTestBasic::checkSegmentRow(PageId segment_id, size_t expected_row_num) -{ - auto segment = segments[segment_id]; - // read written data - auto in = segment->getInputStream(dmContext(), *tableColumns(), {segment->getRowKeyRange()}); - ASSERT_INPUTSTREAM_NROWS(in, expected_row_num); -} - std::optional SegmentTestBasic::splitSegment(PageId segment_id, bool check_rows) { + LOG_FMT_INFO(logger_op, "splitSegment, segment_id={}", segment_id); + auto origin_segment = segments[segment_id]; size_t origin_segment_row_num = getSegmentRowNum(segment_id); + + LOG_FMT_DEBUG(logger, "begin split, segment_id={} rows={}", segment_id, origin_segment_row_num); + auto [left, right] = origin_segment->split(dmContext(), tableColumns()); - if (left || right) + if (!left && !right) { - RUNTIME_CHECK(left && right); - RUNTIME_CHECK(left->segmentId() == segment_id, segment_id, left->info()); - segments[left->segmentId()] = left; // The left segment is updated - segments[right->segmentId()] = right; - if (check_rows) - EXPECT_EQ(origin_segment_row_num, getSegmentRowNum(segment_id) + getSegmentRowNum(right->segmentId())); - return right->segmentId(); + LOG_FMT_DEBUG(logger, "split not succeeded, segment_id={} rows={}", segment_id, origin_segment_row_num); + return std::nullopt; } - return std::nullopt; + + RUNTIME_CHECK(left && right); + RUNTIME_CHECK(left->segmentId() == segment_id, segment_id, left->info()); + segments[left->segmentId()] = left; // The left segment is updated + segments[right->segmentId()] = right; + + auto left_rows = getSegmentRowNum(segment_id); + auto right_rows = getSegmentRowNum(right->segmentId()); + + if (check_rows) + EXPECT_EQ(origin_segment_row_num, left_rows + right_rows); + + LOG_FMT_DEBUG(logger, "split finish, left_id={} left_rows={} right_id={} right_rows={}", left->segmentId(), left_rows, right->segmentId(), right_rows); + + return right->segmentId(); } void SegmentTestBasic::mergeSegment(const std::vector & segments_id, bool check_rows) { + LOG_FMT_INFO(logger_op, "mergeSegment, segments=[{}]", fmt::join(segments_id, ",")); + RUNTIME_CHECK(segments_id.size() >= 2, segments_id.size()); std::vector segments_to_merge; @@ -131,27 +142,30 @@ void SegmentTestBasic::mergeSegment(const std::vector & segments_id, boo segments_rows.emplace_back(getSegmentRowNum(segment_id)); } - LOG_FMT_TRACE(&Poco::Logger::root(), "merge segment [{}], each_rows=[{}]", - fmt::join(segments_id, ","), - fmt::join(segments_rows, ",")); + LOG_FMT_DEBUG(logger, "begin merge, segments=[{}] each_rows=[{}]", fmt::join(segments_id, ","), fmt::join(segments_rows, ",")); SegmentPtr merged_segment = Segment::merge(dmContext(), tableColumns(), segments_to_merge); if (!merged_segment) + { + LOG_FMT_DEBUG(logger, "merge not succeeded, segments=[{}] each_rows=[{}]", fmt::join(segments_id, ","), fmt::join(segments_rows, ",")); return; + } for (const auto segment_id : segments_id) segments.erase(segments.find(segment_id)); segments[merged_segment->segmentId()] = merged_segment; + int merged_rows = std::accumulate(segments_rows.begin(), segments_rows.end(), 0); if (check_rows) - { - auto merged_rows = std::accumulate(segments_rows.begin(), segments_rows.end(), 0); EXPECT_EQ(getSegmentRowNum(merged_segment->segmentId()), merged_rows); - } + + LOG_FMT_DEBUG(logger, "merge finish, merged_segment_id={} merge_from_segments=[{}] merged_rows={}", merged_segment->segmentId(), fmt::join(segments_id, ","), merged_rows); } void SegmentTestBasic::mergeSegmentDelta(PageId segment_id, bool check_rows) { + LOG_FMT_INFO(logger_op, "mergeSegmentDelta, segment_id={}", segment_id); + auto segment = segments[segment_id]; size_t segment_row_num = getSegmentRowNum(segment_id); SegmentPtr merged_segment = segment->mergeDelta(dmContext(), tableColumns()); @@ -164,6 +178,8 @@ void SegmentTestBasic::mergeSegmentDelta(PageId segment_id, bool check_rows) void SegmentTestBasic::flushSegmentCache(PageId segment_id) { + LOG_FMT_INFO(logger_op, "flushSegmentCache, segment_id={}", segment_id); + auto segment = segments[segment_id]; size_t segment_row_num = getSegmentRowNum(segment_id); segment->flushCache(dmContext()); @@ -206,13 +222,20 @@ std::pair SegmentTestBasic::getSegmentKeyRange(SegmentPtr segment) void SegmentTestBasic::writeSegment(PageId segment_id, UInt64 write_rows) { + LOG_FMT_INFO(logger_op, "writeSegment, segment_id={} rows={}", segment_id, write_rows); + if (write_rows == 0) return; + RUNTIME_CHECK(write_rows > 0); + RUNTIME_CHECK(write_rows < std::numeric_limits::max()); + auto segment = segments[segment_id]; size_t segment_row_num = getSegmentRowNumWithoutMVCC(segment_id); auto [start_key, end_key] = getSegmentKeyRange(segment); + LOG_FMT_DEBUG(logger, "write to segment, segment={} segment_rows={} start_key={} end_key={}", segment->info(), segment_row_num, start_key, end_key); + // If the length of segment key range is larger than `write_rows`, then // write the new data with the same tso in one block. // Otherwise create multiple block with increasing tso until the `remain_row_num` @@ -220,6 +243,10 @@ void SegmentTestBasic::writeSegment(PageId segment_id, UInt64 write_rows) UInt64 remain_row_num = 0; if (static_cast(end_key - start_key) > write_rows) { + // The segment range is large enough, let's randomly pick a start key: + // Suppose we have segment range = [0, 11), which could contain at most 11 rows. + // Now we want to write 10 rows -- The write start key could be randomized in [0, 1]. + start_key = std::uniform_int_distribution{start_key, end_key - static_cast(write_rows)}(random); end_key = start_key + write_rows; } else @@ -228,18 +255,18 @@ void SegmentTestBasic::writeSegment(PageId segment_id, UInt64 write_rows) } { // write to segment and not flush + LOG_FMT_DEBUG(logger, "write block to segment, block_range=[{}, {})", start_key, end_key); Block block = DMTestEnv::prepareSimpleWriteBlock(start_key, end_key, false, version, DMTestEnv::pk_name, EXTRA_HANDLE_COLUMN_ID, options.is_common_handle ? EXTRA_HANDLE_COLUMN_STRING_TYPE : EXTRA_HANDLE_COLUMN_INT_TYPE, options.is_common_handle); segment->write(dmContext(), std::move(block), false); - LOG_FMT_TRACE(&Poco::Logger::root(), "write key range [{}, {})", start_key, end_key); version++; } while (remain_row_num > 0) { UInt64 write_num = std::min(remain_row_num, static_cast(end_key - start_key)); + LOG_FMT_DEBUG(logger, "write block to segment, block_range=[{}, {})", start_key, write_num + start_key); Block block = DMTestEnv::prepareSimpleWriteBlock(start_key, write_num + start_key, false, version, DMTestEnv::pk_name, EXTRA_HANDLE_COLUMN_ID, options.is_common_handle ? EXTRA_HANDLE_COLUMN_STRING_TYPE : EXTRA_HANDLE_COLUMN_INT_TYPE, options.is_common_handle); segment->write(dmContext(), std::move(block), false); remain_row_num -= write_num; - LOG_FMT_TRACE(&Poco::Logger::root(), "write key range [{}, {})", start_key, write_num + start_key); version++; } EXPECT_EQ(getSegmentRowNumWithoutMVCC(segment_id), segment_row_num + write_rows); @@ -247,6 +274,8 @@ void SegmentTestBasic::writeSegment(PageId segment_id, UInt64 write_rows) void SegmentTestBasic::ingestDTFileIntoSegment(PageId segment_id, UInt64 write_rows) { + LOG_FMT_INFO(logger_op, "ingestDTFileIntoSegment, segment_id={} rows={}", segment_id, write_rows); + if (write_rows == 0) return; @@ -292,6 +321,7 @@ void SegmentTestBasic::ingestDTFileIntoSegment(PageId segment_id, UInt64 write_r UInt64 remain_row_num = 0; if (static_cast(end_key - start_key) > write_rows) { + start_key = std::uniform_int_distribution{start_key, end_key - static_cast(write_rows)}(random); end_key = start_key + write_rows; } else @@ -300,26 +330,27 @@ void SegmentTestBasic::ingestDTFileIntoSegment(PageId segment_id, UInt64 write_r } { // write to segment and not flush + LOG_FMT_DEBUG(logger, "ingest block to segment, block_range=[{}, {})", start_key, end_key); Block block = DMTestEnv::prepareSimpleWriteBlock(start_key, end_key, false, version, DMTestEnv::pk_name, EXTRA_HANDLE_COLUMN_ID, options.is_common_handle ? EXTRA_HANDLE_COLUMN_STRING_TYPE : EXTRA_HANDLE_COLUMN_INT_TYPE, options.is_common_handle); write_data(segment, block); - LOG_FMT_TRACE(&Poco::Logger::root(), "ingest key range [{}, {})", start_key, end_key); version++; } while (remain_row_num > 0) { UInt64 write_num = std::min(remain_row_num, static_cast(end_key - start_key)); + LOG_FMT_DEBUG(logger, "ingest block to segment, block_range=[{}, {})", start_key, write_num + start_key); Block block = DMTestEnv::prepareSimpleWriteBlock(start_key, write_num + start_key, false, version, DMTestEnv::pk_name, EXTRA_HANDLE_COLUMN_ID, options.is_common_handle ? EXTRA_HANDLE_COLUMN_STRING_TYPE : EXTRA_HANDLE_COLUMN_INT_TYPE, options.is_common_handle); write_data(segment, block); remain_row_num -= write_num; - LOG_FMT_TRACE(&Poco::Logger::root(), "ingest key range [{}, {})", start_key, write_num + start_key); version++; } EXPECT_EQ(getSegmentRowNumWithoutMVCC(segment_id), segment_row_num + write_rows); } -void SegmentTestBasic::writeSegmentWithDeletedPack(PageId segment_id) +void SegmentTestBasic::writeSegmentWithDeletedPack(PageId segment_id, UInt64 write_rows) { - UInt64 write_rows = DEFAULT_MERGE_BLOCK_SIZE; + LOG_FMT_INFO(logger_op, "writeSegmentWithDeletedPack, segment_id={}", segment_id); + auto segment = segments[segment_id]; size_t segment_row_num = getSegmentRowNumWithoutMVCC(segment_id); auto [start_key, end_key] = getSegmentKeyRange(segment); @@ -331,6 +362,7 @@ void SegmentTestBasic::writeSegmentWithDeletedPack(PageId segment_id) UInt64 remain_row_num = 0; if (static_cast(end_key - start_key) > write_rows) { + start_key = std::uniform_int_distribution{start_key, end_key - static_cast(write_rows)}(random); end_key = start_key + write_rows; } else @@ -339,18 +371,18 @@ void SegmentTestBasic::writeSegmentWithDeletedPack(PageId segment_id) } { // write to segment and not flush + LOG_FMT_DEBUG(logger, "write block to segment, block_range=[{}, {})", start_key, end_key); Block block = DMTestEnv::prepareSimpleWriteBlock(start_key, end_key, false, version, DMTestEnv::pk_name, EXTRA_HANDLE_COLUMN_ID, options.is_common_handle ? EXTRA_HANDLE_COLUMN_STRING_TYPE : EXTRA_HANDLE_COLUMN_INT_TYPE, options.is_common_handle, 1, true, true); segment->write(dmContext(), std::move(block), true); - LOG_FMT_TRACE(&Poco::Logger::root(), "write key range [{}, {})", start_key, end_key); version++; } while (remain_row_num > 0) { UInt64 write_num = std::min(remain_row_num, static_cast(end_key - start_key)); + LOG_FMT_DEBUG(logger, "write block to segment, block_range=[{}, {})", start_key, write_num + start_key); Block block = DMTestEnv::prepareSimpleWriteBlock(start_key, write_num + start_key, false, version, DMTestEnv::pk_name, EXTRA_HANDLE_COLUMN_ID, options.is_common_handle ? EXTRA_HANDLE_COLUMN_STRING_TYPE : EXTRA_HANDLE_COLUMN_INT_TYPE, options.is_common_handle, 1, true, true); segment->write(dmContext(), std::move(block), true); remain_row_num -= write_num; - LOG_FMT_TRACE(&Poco::Logger::root(), "write key range [{}, {})", start_key, write_num + start_key); version++; } EXPECT_EQ(getSegmentRowNumWithoutMVCC(segment_id), segment_row_num + write_rows); @@ -358,6 +390,8 @@ void SegmentTestBasic::writeSegmentWithDeletedPack(PageId segment_id) void SegmentTestBasic::deleteRangeSegment(PageId segment_id) { + LOG_FMT_INFO(logger_op, "deleteRangeSegment, segment_id={}", segment_id); + auto segment = segments[segment_id]; segment->write(dmContext(), /*delete_range*/ segment->getRowKeyRange()); EXPECT_EQ(getSegmentRowNum(segment_id), 0); @@ -368,16 +402,19 @@ void SegmentTestBasic::writeRandomSegment() if (segments.empty()) return; PageId random_segment_id = getRandomSegmentId(); - LOG_FMT_DEBUG(&Poco::Logger::root(), "start write segment:{}", random_segment_id); - writeSegment(random_segment_id); + auto write_rows = std::uniform_int_distribution{20, 100}(random); + LOG_FMT_DEBUG(logger, "start random write, segment_id={} write_rows={} all_segments={}", random_segment_id, write_rows, segments.size()); + writeSegment(random_segment_id, write_rows); } + void SegmentTestBasic::writeRandomSegmentWithDeletedPack() { if (segments.empty()) return; PageId random_segment_id = getRandomSegmentId(); - LOG_FMT_DEBUG(&Poco::Logger::root(), "start write segment with deleted pack:{}", random_segment_id); - writeSegmentWithDeletedPack(random_segment_id); + auto write_rows = std::uniform_int_distribution{20, 100}(random); + LOG_FMT_DEBUG(logger, "start random write delete, segment_id={} write_rows={} all_segments={}", random_segment_id, write_rows, segments.size()); + writeSegmentWithDeletedPack(random_segment_id, write_rows); } void SegmentTestBasic::deleteRangeRandomSegment() @@ -385,7 +422,7 @@ void SegmentTestBasic::deleteRangeRandomSegment() if (segments.empty()) return; PageId random_segment_id = getRandomSegmentId(); - LOG_FMT_DEBUG(&Poco::Logger::root(), "start delete range segment:{}", random_segment_id); + LOG_FMT_DEBUG(logger, "start random delete range, segment_id={} all_segments={}", random_segment_id, segments.size()); deleteRangeSegment(random_segment_id); } @@ -394,7 +431,7 @@ void SegmentTestBasic::splitRandomSegment() if (segments.empty()) return; PageId random_segment_id = getRandomSegmentId(); - LOG_FMT_DEBUG(&Poco::Logger::root(), "start split segment:{}", random_segment_id); + LOG_FMT_DEBUG(logger, "start random split, segment_id={} all_segments={}", random_segment_id, segments.size()); splitSegment(random_segment_id); } @@ -403,7 +440,7 @@ void SegmentTestBasic::mergeRandomSegment() if (segments.size() < 2) return; auto segments_id = getRandomMergeableSegments(); - LOG_FMT_DEBUG(&Poco::Logger::root(), "start merge segments [{}]", fmt::join(segments_id, ",")); + LOG_FMT_DEBUG(logger, "start random merge, segments_id=[{}] all_segments={}", fmt::join(segments_id, ","), segments.size()); mergeSegment(segments_id); } @@ -412,7 +449,7 @@ void SegmentTestBasic::mergeDeltaRandomSegment() if (segments.empty()) return; PageId random_segment_id = getRandomSegmentId(); - LOG_FMT_DEBUG(&Poco::Logger::root(), "start merge delta in segment:{}", random_segment_id); + LOG_FMT_DEBUG(logger, "start random merge delta, segment_id={} all_segments={}", random_segment_id, segments.size()); mergeSegmentDelta(random_segment_id); } @@ -421,21 +458,24 @@ void SegmentTestBasic::flushCacheRandomSegment() if (segments.empty()) return; PageId random_segment_id = getRandomSegmentId(); - LOG_FMT_DEBUG(&Poco::Logger::root(), "start flush cache in segment:{}", random_segment_id); + LOG_FMT_DEBUG(logger, "start random flush cache, segment_id={} all_segments={}", random_segment_id, segments.size()); flushSegmentCache(random_segment_id); } void SegmentTestBasic::randomSegmentTest(size_t operator_count) { - auto dist = std::uniform_int_distribution{0, static_cast(SegmentOperatorType::SegmentOperatorMax) - 1}; + auto probabilities = std::vector{}; + std::transform(segment_operator_entries.begin(), segment_operator_entries.end(), std::back_inserter(probabilities), [](auto v) { return v.first; }); + + auto dist = std::discrete_distribution{probabilities.begin(), probabilities.end()}; for (size_t i = 0; i < operator_count; i++) { - auto op = dist(random); - segment_operator_entries[op](); + auto op_idx = dist(random); + segment_operator_entries[op_idx].second(); } } -PageId SegmentTestBasic::getRandomSegmentId() // Complexity is O(n) +PageId SegmentTestBasic::getRandomSegmentId() // Complexity is O(n) { RUNTIME_CHECK(!segments.empty()); auto dist = std::uniform_int_distribution{0, segments.size() - 1}; @@ -452,7 +492,8 @@ std::vector SegmentTestBasic::getRandomMergeableSegments() { RUNTIME_CHECK(segments.size() >= 2, segments.size()); - auto max_merge_segments = std::uniform_int_distribution{2, 6}(random); + // Merge 2~6 segments (at most 1/2 of all segments). + auto max_merge_segments = std::uniform_int_distribution{2, std::clamp(static_cast(segments.size()) / 2, 2, 6)}(random); std::vector segments_id; segments_id.reserve(max_merge_segments); diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.h b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.h index 9db0cd15da0..4313875f1a3 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.h +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.h @@ -20,8 +20,8 @@ #include #include -#include #include +#include namespace DB { @@ -50,7 +50,7 @@ class SegmentTestBasic : public DB::base::TiFlashStorageTestBasic void flushSegmentCache(PageId segment_id); void writeSegment(PageId segment_id, UInt64 write_rows = 100); void ingestDTFileIntoSegment(PageId segment_id, UInt64 write_rows = 100); - void writeSegmentWithDeletedPack(PageId segment_id); + void writeSegmentWithDeletedPack(PageId segment_id, UInt64 write_rows = 100); void deleteRangeSegment(PageId segment_id); @@ -67,7 +67,6 @@ class SegmentTestBasic : public DB::base::TiFlashStorageTestBasic PageId createNewSegmentWithSomeData(); size_t getSegmentRowNumWithoutMVCC(PageId segment_id); size_t getSegmentRowNum(PageId segment_id); - void checkSegmentRow(PageId segment_id, size_t expected_row_num); std::pair getSegmentKeyRange(SegmentPtr segment); protected: @@ -76,28 +75,28 @@ class SegmentTestBasic : public DB::base::TiFlashStorageTestBasic // std::map segments; - enum class SegmentOperatorType: size_t - { - Write = 0, - DeleteRange, - Split, - Merge, - MergeDelta, - FlushCache, - WriteDeletedPack, - SegmentOperatorMax - }; - - const std::vector> segment_operator_entries = { - [this] { writeRandomSegment(); }, - [this] { deleteRangeRandomSegment(); }, - [this] { splitRandomSegment(); }, - [this] { mergeRandomSegment(); }, - [this] { mergeDeltaRandomSegment(); }, - [this] { flushCacheRandomSegment(); }, - [this] { - writeRandomSegmentWithDeletedPack(); - }}; + const std::vector>> segment_operator_entries = { + {1.0, [this] { + writeRandomSegment(); + }}, + {0.25, [this] { + deleteRangeRandomSegment(); + }}, + {1.0, [this] { + splitRandomSegment(); + }}, + {0.5, [this] { + mergeRandomSegment(); + }}, + {1.0, [this] { + mergeDeltaRandomSegment(); + }}, + {1.0, [this] { + flushCacheRandomSegment(); + }}, + {0.25, [this] { + writeRandomSegmentWithDeletedPack(); + }}}; PageId getRandomSegmentId(); @@ -124,6 +123,9 @@ class SegmentTestBasic : public DB::base::TiFlashStorageTestBasic SegmentPtr root_segment; UInt64 version = 0; SegmentTestOptions options; + + LoggerPtr logger_op; + LoggerPtr logger; }; } // namespace tests } // namespace DM From 31c8e638f24426eab83889722181f325526fee7d Mon Sep 17 00:00:00 2001 From: Wish Date: Tue, 13 Sep 2022 00:51:31 +0800 Subject: [PATCH 08/17] protect against non-insertable segment ranges --- .../tests/gtest_segment_test_basic.cpp | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp index 5271cfb015d..6f2ed32376e 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp @@ -236,12 +236,15 @@ void SegmentTestBasic::writeSegment(PageId segment_id, UInt64 write_rows) LOG_FMT_DEBUG(logger, "write to segment, segment={} segment_rows={} start_key={} end_key={}", segment->info(), segment_row_num, start_key, end_key); + auto segment_max_rows = static_cast(end_key - start_key); + if (segment_max_rows == 0) + return; // If the length of segment key range is larger than `write_rows`, then // write the new data with the same tso in one block. // Otherwise create multiple block with increasing tso until the `remain_row_num` // down to 0. UInt64 remain_row_num = 0; - if (static_cast(end_key - start_key) > write_rows) + if (segment_max_rows > write_rows) { // The segment range is large enough, let's randomly pick a start key: // Suppose we have segment range = [0, 11), which could contain at most 11 rows. @@ -251,7 +254,7 @@ void SegmentTestBasic::writeSegment(PageId segment_id, UInt64 write_rows) } else { - remain_row_num = write_rows - static_cast(end_key - start_key); + remain_row_num = write_rows - segment_max_rows; } { // write to segment and not flush @@ -262,7 +265,7 @@ void SegmentTestBasic::writeSegment(PageId segment_id, UInt64 write_rows) } while (remain_row_num > 0) { - UInt64 write_num = std::min(remain_row_num, static_cast(end_key - start_key)); + UInt64 write_num = std::min(remain_row_num, segment_max_rows); LOG_FMT_DEBUG(logger, "write block to segment, block_range=[{}, {})", start_key, write_num + start_key); Block block = DMTestEnv::prepareSimpleWriteBlock(start_key, write_num + start_key, false, version, DMTestEnv::pk_name, EXTRA_HANDLE_COLUMN_ID, options.is_common_handle ? EXTRA_HANDLE_COLUMN_STRING_TYPE : EXTRA_HANDLE_COLUMN_INT_TYPE, options.is_common_handle); segment->write(dmContext(), std::move(block), false); @@ -314,19 +317,22 @@ void SegmentTestBasic::ingestDTFileIntoSegment(PageId segment_id, UInt64 write_r size_t segment_row_num = getSegmentRowNumWithoutMVCC(segment_id); auto [start_key, end_key] = getSegmentKeyRange(segment); + auto segment_max_rows = static_cast(end_key - start_key); + if (segment_max_rows == 0) + return; // If the length of segment key range is larger than `write_rows`, then // write the new data with the same tso in one block. // Otherwise create multiple block with increasing tso until the `remain_row_num` // down to 0. UInt64 remain_row_num = 0; - if (static_cast(end_key - start_key) > write_rows) + if (segment_max_rows > write_rows) { start_key = std::uniform_int_distribution{start_key, end_key - static_cast(write_rows)}(random); end_key = start_key + write_rows; } else { - remain_row_num = write_rows - static_cast(end_key - start_key); + remain_row_num = write_rows - segment_max_rows; } { // write to segment and not flush @@ -337,7 +343,7 @@ void SegmentTestBasic::ingestDTFileIntoSegment(PageId segment_id, UInt64 write_r } while (remain_row_num > 0) { - UInt64 write_num = std::min(remain_row_num, static_cast(end_key - start_key)); + UInt64 write_num = std::min(remain_row_num, segment_max_rows); LOG_FMT_DEBUG(logger, "ingest block to segment, block_range=[{}, {})", start_key, write_num + start_key); Block block = DMTestEnv::prepareSimpleWriteBlock(start_key, write_num + start_key, false, version, DMTestEnv::pk_name, EXTRA_HANDLE_COLUMN_ID, options.is_common_handle ? EXTRA_HANDLE_COLUMN_STRING_TYPE : EXTRA_HANDLE_COLUMN_INT_TYPE, options.is_common_handle); write_data(segment, block); @@ -355,19 +361,22 @@ void SegmentTestBasic::writeSegmentWithDeletedPack(PageId segment_id, UInt64 wri size_t segment_row_num = getSegmentRowNumWithoutMVCC(segment_id); auto [start_key, end_key] = getSegmentKeyRange(segment); + auto segment_max_rows = static_cast(end_key - start_key); + if (segment_max_rows == 0) + return; // If the length of segment key range is larger than `write_rows`, then // write the new data with the same tso in one block. // Otherwise create multiple block with increasing tso until the `remain_row_num` // down to 0. UInt64 remain_row_num = 0; - if (static_cast(end_key - start_key) > write_rows) + if (segment_max_rows > write_rows) { start_key = std::uniform_int_distribution{start_key, end_key - static_cast(write_rows)}(random); end_key = start_key + write_rows; } else { - remain_row_num = write_rows - static_cast(end_key - start_key); + remain_row_num = write_rows - segment_max_rows; } { // write to segment and not flush @@ -378,7 +387,7 @@ void SegmentTestBasic::writeSegmentWithDeletedPack(PageId segment_id, UInt64 wri } while (remain_row_num > 0) { - UInt64 write_num = std::min(remain_row_num, static_cast(end_key - start_key)); + UInt64 write_num = std::min(remain_row_num, segment_max_rows); LOG_FMT_DEBUG(logger, "write block to segment, block_range=[{}, {})", start_key, write_num + start_key); Block block = DMTestEnv::prepareSimpleWriteBlock(start_key, write_num + start_key, false, version, DMTestEnv::pk_name, EXTRA_HANDLE_COLUMN_ID, options.is_common_handle ? EXTRA_HANDLE_COLUMN_STRING_TYPE : EXTRA_HANDLE_COLUMN_INT_TYPE, options.is_common_handle, 1, true, true); segment->write(dmContext(), std::move(block), true); From 56b9a43d912041d99535aeebfd49c4fe94cbce32 Mon Sep 17 00:00:00 2001 From: Wish Date: Tue, 13 Sep 2022 01:25:51 +0800 Subject: [PATCH 09/17] Add unit tests for segment merge --- .../DeltaMerge/tests/gtest_dm_segment.cpp | 46 +-------- .../DeltaMerge/tests/gtest_segment.cpp | 99 +++++++++++++++++++ 2 files changed, 100 insertions(+), 45 deletions(-) diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_dm_segment.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_dm_segment.cpp index dba988a9f7c..79ae30298ff 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_dm_segment.cpp +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_dm_segment.cpp @@ -1141,7 +1141,7 @@ try ASSERT_EQ(rows1 + rows2, (size_t)200); } - // Test merge two + // Test merge { WriteBatches wbs(dmContext().storage_pool); @@ -1170,50 +1170,6 @@ try auto rows = read_rows(new_segment); ASSERT_EQ(rows, (size_t)300); } - - // Split into 3 - SegmentPtr seg1, seg2, seg3; - { - std::tie(seg1, seg2) = segment->split(dmContext(), tableColumns()); - ASSERT_TRUE(seg1); - ASSERT_TRUE(seg2); - std::tie(seg2, seg3) = seg2->split(dmContext(), tableColumns()); - ASSERT_TRUE(seg2); - ASSERT_TRUE(seg3); - } - -// { -// // TODO: This test case can be more readable when we support split at point. -// -// auto actual_new_rows = 0; -// { -// actual_new_rows += (seg2->getRowKeyRange().getEnd().int_value - seg2->getRowKeyRange().getStart().int_value); -// Block block = DMTestEnv::prepareSimpleWriteBlock(seg2->getRowKeyRange().getStart().int_value - 50, seg2->getRowKeyRange().getEnd().int_value + 50, false, /* tso */ 5); -// segment->write(dmContext(), std::move(block)); -// // Not flushed. -// } -// -// WriteBatches wbs(dmContext().storage_pool); -// auto snap1 = seg1->createSnapshot(dmContext(), true, CurrentMetrics::DT_SnapshotOfSegmentMerge); -// auto snap2 = seg2->createSnapshot(dmContext(), true, CurrentMetrics::DT_SnapshotOfSegmentMerge); -// auto snap3 = seg3->createSnapshot(dmContext(), true, CurrentMetrics::DT_SnapshotOfSegmentMerge); -// -// { -// actual_new_rows += 100; -// Block block = DMTestEnv::prepareSimpleWriteBlock(seg3->getRowKeyRange().getStart().int_value - 50, seg3->getRowKeyRange().getStart().int_value + 100, false, /* tso */ 5); -// segment->write(dmContext(), std::move(block)); -// // Not flushed, write after snapshot. -// } -// -// { -// actual_new_rows += 42; -// Block block = DMTestEnv::prepareSimpleWriteBlock(seg1->getRowKeyRange().getEnd().int_value - 42, seg1->getRowKeyRange().getEnd().int_value + 50, false, /* tso */ 5); -// segment->write(dmContext(), std::move(block)); -// segment->flushCache(dmContext()); // Flushed after snapshot. -// } -// -// -// } } CATCH diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_segment.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_segment.cpp index 3d62367bcdd..8bd4f1c5645 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_segment.cpp +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_segment.cpp @@ -92,6 +92,105 @@ try } CATCH +TEST_F(SegmentOperationTest, TestSegmentMemTableDataAfterSplit) +try +{ + SegmentTestOptions options; + reloadWithOptions(options); + writeSegment(DELTA_MERGE_FIRST_SEGMENT_ID, 100); + flushSegmentCache(DELTA_MERGE_FIRST_SEGMENT_ID); + mergeSegmentDelta(DELTA_MERGE_FIRST_SEGMENT_ID); + + writeSegment(DELTA_MERGE_FIRST_SEGMENT_ID, 70); // Write data without flush + auto segment_id_2nd = splitSegment(DELTA_MERGE_FIRST_SEGMENT_ID); + ASSERT_TRUE(segment_id_2nd.has_value()); + ASSERT_EQ(segments.size(), 2); + // The mem table data may be fallen in either segment (as we write randomly). + ASSERT_EQ(getSegmentRowNum(DELTA_MERGE_FIRST_SEGMENT_ID) + getSegmentRowNum(*segment_id_2nd), 170); +} +CATCH + +TEST_F(SegmentOperationTest, TestSegmentMergeTwo) +try +{ + SegmentTestOptions options; + reloadWithOptions(options); + writeSegment(DELTA_MERGE_FIRST_SEGMENT_ID, 100); + flushSegmentCache(DELTA_MERGE_FIRST_SEGMENT_ID); + mergeSegmentDelta(DELTA_MERGE_FIRST_SEGMENT_ID); + + auto segment_id_2nd = splitSegment(DELTA_MERGE_FIRST_SEGMENT_ID); + // now we have segments = { DELTA_MERGE_FIRST_SEGMENT_ID, segment_id_2nd } + ASSERT_TRUE(segment_id_2nd.has_value()); + ASSERT_EQ(getSegmentRowNumWithoutMVCC(DELTA_MERGE_FIRST_SEGMENT_ID), 50); + ASSERT_EQ(getSegmentRowNumWithoutMVCC(*segment_id_2nd), 50); + ASSERT_EQ(segments.size(), 2); + + auto segment_id_3rd = splitSegment(*segment_id_2nd); + // now we have segments = { DELTA_MERGE_FIRST_SEGMENT_ID, segment_id_2nd, segment_id_3rd } + ASSERT_TRUE(segment_id_3rd.has_value()); + ASSERT_EQ(getSegmentRowNumWithoutMVCC(*segment_id_2nd), 25); + ASSERT_EQ(getSegmentRowNumWithoutMVCC(*segment_id_3rd), 25); + ASSERT_EQ(segments.size(), 3); + + writeSegment(*segment_id_2nd, 7); + ASSERT_EQ(getSegmentRowNumWithoutMVCC(*segment_id_2nd), 25+7); + mergeSegment({ *segment_id_2nd, *segment_id_3rd }); + // now we have segments = { DELTA_MERGE_FIRST_SEGMENT_ID, segment_id_2nd } + ASSERT_EQ(getSegmentRowNumWithoutMVCC(*segment_id_2nd), 50+7); + ASSERT_TRUE(segments.find(*segment_id_3rd) == segments.end()); + ASSERT_EQ(segments.size(), 2); +} +CATCH + +TEST_F(SegmentOperationTest, TestSegmentMergeThree) +try +{ + SegmentTestOptions options; + reloadWithOptions(options); + writeSegment(DELTA_MERGE_FIRST_SEGMENT_ID, 100); + flushSegmentCache(DELTA_MERGE_FIRST_SEGMENT_ID); + mergeSegmentDelta(DELTA_MERGE_FIRST_SEGMENT_ID); + + auto segment_id_2nd = splitSegment(DELTA_MERGE_FIRST_SEGMENT_ID); + auto segment_id_3rd = splitSegment(*segment_id_2nd); + // now we have segments = { DELTA_MERGE_FIRST_SEGMENT_ID, segment_id_2nd, segment_id_3rd } + ASSERT_EQ(segments.size(), 3); + ASSERT_EQ(getSegmentRowNumWithoutMVCC(DELTA_MERGE_FIRST_SEGMENT_ID), 50); + ASSERT_EQ(getSegmentRowNumWithoutMVCC(*segment_id_2nd), 25); + ASSERT_EQ(getSegmentRowNumWithoutMVCC(*segment_id_3rd), 25); + + writeSegment(DELTA_MERGE_FIRST_SEGMENT_ID, 11); + flushSegmentCache(DELTA_MERGE_FIRST_SEGMENT_ID); + ASSERT_EQ(getSegmentRowNumWithoutMVCC(DELTA_MERGE_FIRST_SEGMENT_ID), 50+11); + writeSegment(*segment_id_2nd, 7); + ASSERT_EQ(getSegmentRowNumWithoutMVCC(*segment_id_2nd), 25+7); + mergeSegment({ DELTA_MERGE_FIRST_SEGMENT_ID, *segment_id_2nd, *segment_id_3rd }); + // now we have segments = { DELTA_MERGE_FIRST_SEGMENT_ID } + ASSERT_EQ(getSegmentRowNumWithoutMVCC(DELTA_MERGE_FIRST_SEGMENT_ID), 100+11+7); + ASSERT_TRUE(segments.find(*segment_id_2nd) == segments.end()); + ASSERT_TRUE(segments.find(*segment_id_3rd) == segments.end()); + ASSERT_EQ(segments.size(), 1); +} +CATCH + +TEST_F(SegmentOperationTest, TestSegmentMergeInvalid) +try +{ + SegmentTestOptions options; + reloadWithOptions(options); + writeSegment(DELTA_MERGE_FIRST_SEGMENT_ID, 100); + flushSegmentCache(DELTA_MERGE_FIRST_SEGMENT_ID); + mergeSegmentDelta(DELTA_MERGE_FIRST_SEGMENT_ID); + + auto segment_id_2nd = splitSegment(DELTA_MERGE_FIRST_SEGMENT_ID); + auto segment_id_3rd = splitSegment(*segment_id_2nd); + // now we have segments = { DELTA_MERGE_FIRST_SEGMENT_ID, segment_id_2nd, segment_id_3rd } + + ASSERT_THROW({ mergeSegment({ DELTA_MERGE_FIRST_SEGMENT_ID, /* omit segment_id_2nd */ *segment_id_3rd }); }, DB::Exception); +} +CATCH + TEST_F(SegmentOperationTest, TestSegmentRandom) try { From 6d5ff10ad7ebd8cb39f78d07bb9c6981ebdb1edc Mon Sep 17 00:00:00 2001 From: Wish Date: Tue, 13 Sep 2022 01:26:09 +0800 Subject: [PATCH 10/17] Reformat --- .../DeltaMerge/tests/gtest_segment.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_segment.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_segment.cpp index 8bd4f1c5645..1714eb7b62e 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_segment.cpp +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_segment.cpp @@ -86,7 +86,7 @@ try flushSegmentCache(*segment_id); deleteRangeSegment(*segment_id); writeSegmentWithDeletedPack(*segment_id); - mergeSegment({ DELTA_MERGE_FIRST_SEGMENT_ID, *segment_id }); + mergeSegment({DELTA_MERGE_FIRST_SEGMENT_ID, *segment_id}); EXPECT_EQ(getSegmentRowNum(DELTA_MERGE_FIRST_SEGMENT_ID), origin_rows); } @@ -134,10 +134,10 @@ try ASSERT_EQ(segments.size(), 3); writeSegment(*segment_id_2nd, 7); - ASSERT_EQ(getSegmentRowNumWithoutMVCC(*segment_id_2nd), 25+7); - mergeSegment({ *segment_id_2nd, *segment_id_3rd }); + ASSERT_EQ(getSegmentRowNumWithoutMVCC(*segment_id_2nd), 25 + 7); + mergeSegment({*segment_id_2nd, *segment_id_3rd}); // now we have segments = { DELTA_MERGE_FIRST_SEGMENT_ID, segment_id_2nd } - ASSERT_EQ(getSegmentRowNumWithoutMVCC(*segment_id_2nd), 50+7); + ASSERT_EQ(getSegmentRowNumWithoutMVCC(*segment_id_2nd), 50 + 7); ASSERT_TRUE(segments.find(*segment_id_3rd) == segments.end()); ASSERT_EQ(segments.size(), 2); } @@ -162,12 +162,12 @@ try writeSegment(DELTA_MERGE_FIRST_SEGMENT_ID, 11); flushSegmentCache(DELTA_MERGE_FIRST_SEGMENT_ID); - ASSERT_EQ(getSegmentRowNumWithoutMVCC(DELTA_MERGE_FIRST_SEGMENT_ID), 50+11); + ASSERT_EQ(getSegmentRowNumWithoutMVCC(DELTA_MERGE_FIRST_SEGMENT_ID), 50 + 11); writeSegment(*segment_id_2nd, 7); - ASSERT_EQ(getSegmentRowNumWithoutMVCC(*segment_id_2nd), 25+7); - mergeSegment({ DELTA_MERGE_FIRST_SEGMENT_ID, *segment_id_2nd, *segment_id_3rd }); + ASSERT_EQ(getSegmentRowNumWithoutMVCC(*segment_id_2nd), 25 + 7); + mergeSegment({DELTA_MERGE_FIRST_SEGMENT_ID, *segment_id_2nd, *segment_id_3rd}); // now we have segments = { DELTA_MERGE_FIRST_SEGMENT_ID } - ASSERT_EQ(getSegmentRowNumWithoutMVCC(DELTA_MERGE_FIRST_SEGMENT_ID), 100+11+7); + ASSERT_EQ(getSegmentRowNumWithoutMVCC(DELTA_MERGE_FIRST_SEGMENT_ID), 100 + 11 + 7); ASSERT_TRUE(segments.find(*segment_id_2nd) == segments.end()); ASSERT_TRUE(segments.find(*segment_id_3rd) == segments.end()); ASSERT_EQ(segments.size(), 1); @@ -187,7 +187,7 @@ try auto segment_id_3rd = splitSegment(*segment_id_2nd); // now we have segments = { DELTA_MERGE_FIRST_SEGMENT_ID, segment_id_2nd, segment_id_3rd } - ASSERT_THROW({ mergeSegment({ DELTA_MERGE_FIRST_SEGMENT_ID, /* omit segment_id_2nd */ *segment_id_3rd }); }, DB::Exception); + ASSERT_THROW({ mergeSegment({DELTA_MERGE_FIRST_SEGMENT_ID, /* omit segment_id_2nd */ *segment_id_3rd}); }, DB::Exception); } CATCH From 9a975379a502fead7e6c148de2597c0faef519dd Mon Sep 17 00:00:00 2001 From: Wish Date: Tue, 13 Sep 2022 10:05:41 +0800 Subject: [PATCH 11/17] Fix issues reported by static analysis Signed-off-by: Wish --- dbms/src/Storages/DeltaMerge/Segment.cpp | 1 + .../tests/gtest_dm_segment_common_handle.cpp | 24 ++++++++----------- .../tests/gtest_segment_test_basic.cpp | 7 ++++-- .../tests/gtest_segment_test_basic.h | 1 + 4 files changed, 17 insertions(+), 16 deletions(-) diff --git a/dbms/src/Storages/DeltaMerge/Segment.cpp b/dbms/src/Storages/DeltaMerge/Segment.cpp index 78cc9e936df..e567322a0e8 100644 --- a/dbms/src/Storages/DeltaMerge/Segment.cpp +++ b/dbms/src/Storages/DeltaMerge/Segment.cpp @@ -1237,6 +1237,7 @@ SegmentPtr Segment::merge(DMContext & dm_context, const ColumnDefinesPtr & schem SYNC_FOR("before_Segment::applyMerge"); // pause without holding the lock on segments to be merged std::vector locks; + locks.reserve(ordered_segments.size()); for (const auto & seg : ordered_segments) locks.emplace_back(seg->mustGetUpdateLock()); diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_dm_segment_common_handle.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_dm_segment_common_handle.cpp index f577cb4a09d..64d968bce3e 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_dm_segment_common_handle.cpp +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_dm_segment_common_handle.cpp @@ -705,21 +705,17 @@ try ASSERT_EQ(num_rows_seg1 + num_rows_seg2, num_rows_write); // merge segments - // TODO: enable merge test! - if (false) + segment = Segment::merge(dmContext(), tableColumns(), {segment, new_segment}); { - segment = Segment::merge(dmContext(), tableColumns(), {segment, new_segment}); - { - // check merged segment range - const auto & merged_range = segment->getRowKeyRange(); - EXPECT_EQ(*merged_range.start.value, *s1_range.start.value); - EXPECT_EQ(*merged_range.end.value, *s2_range.end.value); - // TODO check segment epoch is increase - } - { - auto in = segment->getInputStream(dmContext(), *tableColumns(), {RowKeyRange::newAll(is_common_handle, rowkey_column_size)}); - ASSERT_INPUTSTREAM_NROWS(in, num_rows_write); - } + // check merged segment range + const auto & merged_range = segment->getRowKeyRange(); + EXPECT_EQ(*merged_range.start.value, *s1_range.start.value); + EXPECT_EQ(*merged_range.end.value, *s2_range.end.value); + // TODO check segment epoch is increase + } + { + auto in = segment->getInputStream(dmContext(), *tableColumns(), {RowKeyRange::newAll(is_common_handle, rowkey_column_size)}); + ASSERT_INPUTSTREAM_NROWS(in, num_rows_write); } } CATCH diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp index 6f2ed32376e..0eee83b54c7 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.cpp @@ -131,6 +131,7 @@ void SegmentTestBasic::mergeSegment(const std::vector & segments_id, boo std::vector segments_to_merge; std::vector segments_rows; + size_t merged_rows = 0; segments_to_merge.reserve(segments_id.size()); segments_rows.reserve(segments_id.size()); @@ -139,7 +140,10 @@ void SegmentTestBasic::mergeSegment(const std::vector & segments_id, boo auto it = segments.find(segment_id); RUNTIME_CHECK(it != segments.end(), segment_id); segments_to_merge.emplace_back(it->second); - segments_rows.emplace_back(getSegmentRowNum(segment_id)); + + auto rows = getSegmentRowNum(segment_id); + segments_rows.emplace_back(rows); + merged_rows += rows; } LOG_FMT_DEBUG(logger, "begin merge, segments=[{}] each_rows=[{}]", fmt::join(segments_id, ","), fmt::join(segments_rows, ",")); @@ -155,7 +159,6 @@ void SegmentTestBasic::mergeSegment(const std::vector & segments_id, boo segments.erase(segments.find(segment_id)); segments[merged_segment->segmentId()] = merged_segment; - int merged_rows = std::accumulate(segments_rows.begin(), segments_rows.end(), 0); if (check_rows) EXPECT_EQ(getSegmentRowNum(merged_segment->segmentId()), merged_rows); diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.h b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.h index 4313875f1a3..5a07066cea9 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.h +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_segment_test_basic.h @@ -15,6 +15,7 @@ #include #include +#include #include #include #include From 9b4f09e84b518cae205a38a436caaa4516e43f87 Mon Sep 17 00:00:00 2001 From: Wish Date: Tue, 13 Sep 2022 14:12:08 +0800 Subject: [PATCH 12/17] Address comments Signed-off-by: Wish --- .../DeltaMerge/DeltaMergeStore_InternalSegment.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalSegment.cpp b/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalSegment.cpp index 8a74b3e1372..39a6e58d0c6 100644 --- a/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalSegment.cpp +++ b/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalSegment.cpp @@ -212,19 +212,17 @@ SegmentPtr DeltaMergeStore::segmentMerge(DMContext & dm_context, const std::vect { if (!isSegmentValid(lock, seg)) { - LOG_FMT_DEBUG(log, "Merge - Give up segmentMerge because not valid, segment={}", seg->info()); + LOG_FMT_DEBUG(log, "Merge - Give up segmentMerge because not valid, segment={}", seg->simpleInfo()); return {}; } } for (const auto & seg : ordered_segments) { - // TODO: Should we ensure the ordering of "segments" first? - auto snap = seg->createSnapshot(dm_context, /* for_update */ true, CurrentMetrics::DT_SnapshotOfSegmentMerge); if (!snap) { - LOG_FMT_DEBUG(log, "Merge - Give up segmentMerge because snapshot failed, segment={}", seg->info()); + LOG_FMT_DEBUG(log, "Merge - Give up segmentMerge because snapshot failed, segment={}", seg->simpleInfo()); return {}; } @@ -269,7 +267,7 @@ SegmentPtr DeltaMergeStore::segmentMerge(DMContext & dm_context, const std::vect { if (!isSegmentValid(lock, seg)) { - LOG_FMT_DEBUG(log, "Merge - Give up segmentMerge because not valid, segment={}", seg->info()); + LOG_FMT_DEBUG(log, "Merge - Give up segmentMerge because not valid, segment={}", seg->simpleInfo()); wbs.setRollback(); return {}; } From 4f6235e5a80c87505857eba64ce1ca509188a812 Mon Sep 17 00:00:00 2001 From: Wish Date: Tue, 13 Sep 2022 15:20:14 +0800 Subject: [PATCH 13/17] refactor: Move storage bg out Signed-off-by: Wish --- .../Storages/DeltaMerge/DeltaMergeStore.cpp | 413 ---------------- .../DeltaMerge/DeltaMergeStore_InternalBg.cpp | 450 ++++++++++++++++++ 2 files changed, 450 insertions(+), 413 deletions(-) create mode 100644 dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalBg.cpp diff --git a/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp b/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp index 0f1593573ab..018c601d609 100644 --- a/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp +++ b/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp @@ -66,7 +66,6 @@ extern const Metric DT_DeltaMergeTotalBytes; extern const Metric DT_DeltaMergeTotalRows; extern const Metric DT_SnapshotOfRead; extern const Metric DT_SnapshotOfReadRaw; -extern const Metric DT_SnapshotOfDeltaMerge; extern const Metric DT_SnapshotOfPlaceIndex; } // namespace CurrentMetrics @@ -80,8 +79,6 @@ extern const int LOGICAL_ERROR; namespace FailPoints { extern const char skip_check_segment_update[]; -extern const char pause_before_dt_background_delta_merge[]; -extern const char pause_until_dt_background_delta_merge[]; extern const char pause_when_writing_to_dt_store[]; extern const char pause_when_altering_dt_store[]; extern const char force_triggle_background_merge_delta[]; @@ -289,120 +286,6 @@ DeltaMergeStore::~DeltaMergeStore() LOG_FMT_INFO(log, "Release DeltaMerge Store end"); } -void DeltaMergeStore::setUpBackgroundTask(const DMContextPtr & dm_context) -{ - // Callbacks for cleaning outdated DTFiles. Note that there is a chance - // that callbacks is called after the `DeltaMergeStore` dropped, we must - // make the callbacks safe. - ExternalPageCallbacks callbacks; - callbacks.ns_id = storage_pool->getNamespaceId(); - callbacks.scanner = [path_pool_weak_ref = std::weak_ptr(path_pool), file_provider = global_context.getFileProvider()]() { - ExternalPageCallbacks::PathAndIdsVec path_and_ids_vec; - - // If the StoragePathPool is invalid, meaning we call `scanner` after dropping the table, - // simply return an empty list is OK. - auto path_pool = path_pool_weak_ref.lock(); - if (!path_pool) - return path_and_ids_vec; - - // Return the DTFiles on disks. - auto delegate = path_pool->getStableDiskDelegator(); - // Only return the DTFiles can be GC. The page id of not able to be GC files, which is being ingested or in the middle of - // SegmentSplit/Merge/MergeDelta, is not yet applied - // to PageStorage is marked as not able to be GC, so we don't return them and run the `remover` - DMFile::ListOptions options; - options.only_list_can_gc = true; - for (auto & root_path : delegate.listPaths()) - { - std::set ids_under_path; - auto file_ids_in_current_path = DMFile::listAllInPath(file_provider, root_path, options); - path_and_ids_vec.emplace_back(root_path, std::move(file_ids_in_current_path)); - } - return path_and_ids_vec; - }; - callbacks.remover = [path_pool_weak_ref = std::weak_ptr(path_pool), // - file_provider = global_context.getFileProvider(), - logger = log](const ExternalPageCallbacks::PathAndIdsVec & path_and_ids_vec, const std::set & valid_ids) { - // If the StoragePathPool is invalid, meaning we call `remover` after dropping the table, - // simply skip is OK. - auto path_pool = path_pool_weak_ref.lock(); - if (!path_pool) - return; - - SYNC_FOR("before_DeltaMergeStore::callbacks_remover_remove"); - auto delegate = path_pool->getStableDiskDelegator(); - for (const auto & [path, ids] : path_and_ids_vec) - { - for (auto id : ids) - { - if (valid_ids.count(id)) - continue; - - // Note that page_id is useless here. - auto dmfile = DMFile::restore(file_provider, id, /* page_id= */ 0, path, DMFile::ReadMetaMode::none()); - if (unlikely(!dmfile)) - { - // If the dtfile directory is not exist, it means `StoragePathPool::drop` have been - // called in another thread. Just try to clean if any id is left. - try - { - delegate.removeDTFile(id); - } - catch (DB::Exception & e) - { - // just ignore - } - LOG_FMT_INFO(logger, - "GC try remove useless DM file, but file not found and may have been removed, dmfile={}", - DMFile::getPathByStatus(path, id, DMFile::Status::READABLE)); - } - else if (dmfile->canGC()) - { - // StoragePathPool::drop may be called concurrently, ignore and continue next file if any exception thrown - String err_msg; - try - { - // scanner should only return dtfiles that can GC, - // just another check here. - delegate.removeDTFile(dmfile->fileId()); - dmfile->remove(file_provider); - } - catch (DB::Exception & e) - { - err_msg = e.message(); - } - catch (Poco::Exception & e) - { - err_msg = e.message(); - } - if (err_msg.empty()) - LOG_FMT_INFO(logger, "GC removed useless DM file, dmfile={}", dmfile->path()); - else - LOG_FMT_INFO(logger, "GC try remove useless DM file, but error happen, dmfile={} err_msg={}", dmfile->path(), err_msg); - } - } - } - }; - // remember to unregister it when shutdown - storage_pool->dataRegisterExternalPagesCallbacks(callbacks); - storage_pool->enableGC(); - - background_task_handle = background_pool.addTask([this] { return handleBackgroundTask(false); }); - - blockable_background_pool_handle = blockable_background_pool.addTask([this] { return handleBackgroundTask(true); }); - - // Do place delta index. - for (auto & [end, segment] : segments) - { - (void)end; - checkSegmentUpdate(dm_context, segment, ThreadType::Init); - } - - // Wake up to do place delta index tasks. - background_task_handle->wake(); - blockable_background_pool_handle->wake(); -} - void DeltaMergeStore::rename(String /*new_path*/, String new_database_name, String new_table_name) { path_pool->rename(new_database_name, new_table_name); @@ -1533,302 +1416,6 @@ void DeltaMergeStore::checkSegmentUpdate(const DMContextPtr & dm_context, const // The segment does not need any updates for now. } -bool DeltaMergeStore::updateGCSafePoint() -{ - if (auto pd_client = global_context.getTMTContext().getPDClient(); !pd_client->isMock()) - { - auto safe_point = PDClientHelper::getGCSafePointWithRetry( - pd_client, - /* ignore_cache= */ false, - global_context.getSettingsRef().safe_point_update_interval_seconds); - latest_gc_safe_point.store(safe_point, std::memory_order_release); - return true; - } - return false; -} - -bool DeltaMergeStore::handleBackgroundTask(bool heavy) -{ - auto task = background_tasks.nextTask(heavy, log); - if (!task) - return false; - - // Update GC safe point before background task - // Foreground task don't get GC safe point from remote, but we better make it as up to date as possible. - if (updateGCSafePoint()) - { - /// Note that `task.dm_context->db_context` will be free after query is finish. We should not use that in background task. - task.dm_context->min_version = latest_gc_safe_point.load(std::memory_order_relaxed); - LOG_FMT_DEBUG(log, "Task {} GC safe point: {}", toString(task.type), task.dm_context->min_version); - } - - SegmentPtr left, right; - ThreadType type = ThreadType::Write; - try - { - switch (task.type) - { - case TaskType::Split: - std::tie(left, right) = segmentSplit(*task.dm_context, task.segment, false); - type = ThreadType::BG_Split; - break; - case TaskType::Merge: - segmentMerge(*task.dm_context, {task.segment, task.next_segment}, false); - type = ThreadType::BG_Merge; - break; - case TaskType::MergeDelta: - { - FAIL_POINT_PAUSE(FailPoints::pause_before_dt_background_delta_merge); - left = segmentMergeDelta(*task.dm_context, task.segment, MergeDeltaReason::BackgroundThreadPool); - type = ThreadType::BG_MergeDelta; - // Wake up all waiting threads if failpoint is enabled - FailPointHelper::disableFailPoint(FailPoints::pause_until_dt_background_delta_merge); - break; - } - case TaskType::Compact: - task.segment->compactDelta(*task.dm_context); - left = task.segment; - type = ThreadType::BG_Compact; - break; - case TaskType::Flush: - task.segment->flushCache(*task.dm_context); - // After flush cache, better place delta index. - task.segment->placeDeltaIndex(*task.dm_context); - left = task.segment; - type = ThreadType::BG_Flush; - break; - case TaskType::PlaceIndex: - task.segment->placeDeltaIndex(*task.dm_context); - break; - default: - throw Exception(fmt::format("Unsupported task type: {}", toString(task.type))); - } - } - catch (const Exception & e) - { - LOG_FMT_ERROR( - log, - "Execute task on segment failed, task={} segment={}{} err={}", - DeltaMergeStore::toString(task.type), - task.segment->simpleInfo(), - ((bool)task.next_segment ? (fmt::format(" next_segment={}", task.next_segment->simpleInfo())) : ""), - e.message()); - e.rethrow(); - } - catch (...) - { - tryLogCurrentException(__PRETTY_FUNCTION__); - throw; - } - - // continue to check whether we need to apply more tasks after this task is ended. - if (left) - checkSegmentUpdate(task.dm_context, left, type); - if (right) - checkSegmentUpdate(task.dm_context, right, type); - - return true; -} - -namespace GC -{ -// Returns true if it needs gc. -// This is for optimization purpose, does not mean to be accurate. -bool shouldCompactStable(const SegmentPtr & seg, DB::Timestamp gc_safepoint, double ratio_threshold, const LoggerPtr & log) -{ - // Always GC. - if (ratio_threshold < 1.0) - return true; - - const auto & property = seg->getStable()->getStableProperty(); - LOG_FMT_TRACE(log, "{}", property.toDebugString()); - // No data older than safe_point to GC. - if (property.gc_hint_version > gc_safepoint) - return false; - // A lot of MVCC versions to GC. - if (property.num_versions > property.num_rows * ratio_threshold) - return true; - // A lot of non-effective MVCC versions to GC. - if (property.num_versions > property.num_puts * ratio_threshold) - return true; - return false; -} - -bool shouldCompactDeltaWithStable(const DMContext & context, const SegmentSnapshotPtr & snap, const RowKeyRange & segment_range, double ratio_threshold, const LoggerPtr & log) -{ - auto actual_delete_range = snap->delta->getSquashDeleteRange().shrink(segment_range); - if (actual_delete_range.none()) - return false; - - auto [delete_rows, delete_bytes] = snap->stable->getApproxRowsAndBytes(context, actual_delete_range); - - auto stable_rows = snap->stable->getRows(); - auto stable_bytes = snap->stable->getBytes(); - - LOG_FMT_TRACE(log, "delete range rows [{}], delete_bytes [{}] stable_rows [{}] stable_bytes [{}]", delete_rows, delete_bytes, stable_rows, stable_bytes); - - // 1. for small tables, the data may just reside in delta and stable_rows may be 0, - // so the `=` in `>=` is needed to cover the scenario when set tiflash replica of small tables to 0. - // (i.e. `actual_delete_range` is not none, but `delete_rows` and `stable_rows` are both 0). - // 2. the disadvantage of `=` in `>=` is that it may trigger an extra gc when write apply snapshot file to an empty segment, - // because before write apply snapshot file, it will write a delete range first, and will meet the following gc criteria. - // But the cost should be really minor because merge delta on an empty segment should be very fast. - // What's more, we can ignore this kind of delete range in future to avoid this extra gc. - bool should_compact = (delete_rows >= stable_rows * ratio_threshold) || (delete_bytes >= stable_bytes * ratio_threshold); - return should_compact; -} -} // namespace GC - -UInt64 DeltaMergeStore::onSyncGc(Int64 limit) -{ - if (shutdown_called.load(std::memory_order_relaxed)) - return 0; - - if (!updateGCSafePoint()) - return 0; - - { - std::shared_lock lock(read_write_mutex); - // avoid gc on empty tables - if (segments.size() == 1) - { - const auto & seg = segments.begin()->second; - if (seg->getEstimatedRows() == 0) - return 0; - } - } - - DB::Timestamp gc_safe_point = latest_gc_safe_point.load(std::memory_order_acquire); - LOG_FMT_TRACE(log, - "GC on table {} start with key: {}, gc_safe_point: {}, max gc limit: {}", - table_name, - next_gc_check_key.toDebugString(), - gc_safe_point, - limit); - - UInt64 check_segments_num = 0; - Int64 gc_segments_num = 0; - while (gc_segments_num < limit) - { - // If the store is shut down, give up running GC on it. - if (shutdown_called.load(std::memory_order_relaxed)) - break; - - auto dm_context = newDMContext(global_context, global_context.getSettingsRef(), "onSyncGc"); - SegmentPtr segment; - SegmentSnapshotPtr segment_snap; - { - std::shared_lock lock(read_write_mutex); - - auto segment_it = segments.upper_bound(next_gc_check_key.toRowKeyValueRef()); - if (segment_it == segments.end()) - segment_it = segments.begin(); - - // we have check all segments, stop here - if (check_segments_num >= segments.size()) - break; - check_segments_num++; - - segment = segment_it->second; - next_gc_check_key = segment_it->first.toRowKeyValue(); - segment_snap = segment->createSnapshot(*dm_context, /* for_update */ true, CurrentMetrics::DT_SnapshotOfDeltaMerge); - } - - assert(segment != nullptr); - if (segment->hasAbandoned() || segment_snap == nullptr) - continue; - - const auto segment_id = segment->segmentId(); - RowKeyRange segment_range = segment->getRowKeyRange(); - - // meet empty segment, try merge it - if (segment_snap->getRows() == 0) - { - // release segment_snap before checkSegmentUpdate, otherwise this segment is still in update status. - segment_snap = {}; - checkSegmentUpdate(dm_context, segment, ThreadType::BG_GC); - continue; - } - - try - { - // Check whether we should apply gc on this segment - bool should_compact = false; - if (GC::shouldCompactDeltaWithStable( - *dm_context, - segment_snap, - segment_range, - global_context.getSettingsRef().dt_bg_gc_delta_delete_ratio_to_trigger_gc, - log)) - { - should_compact = true; - } - else if (segment->getLastCheckGCSafePoint() < gc_safe_point) - { - // Avoid recheck this segment when gc_safe_point doesn't change regardless whether we trigger this segment's DeltaMerge or not. - // Because after we calculate StableProperty and compare it with this gc_safe_point, - // there is no need to recheck it again using the same gc_safe_point. - // On the other hand, if it should do DeltaMerge using this gc_safe_point, and the DeltaMerge is interruptted by other process, - // it's still worth to wait another gc_safe_point to check this segment again. - segment->setLastCheckGCSafePoint(gc_safe_point); - dm_context->min_version = gc_safe_point; - - // calculate StableProperty if needed - if (!segment->getStable()->isStablePropertyCached()) - segment->getStable()->calculateStableProperty(*dm_context, segment_range, isCommonHandle()); - - should_compact = GC::shouldCompactStable( - segment, - gc_safe_point, - global_context.getSettingsRef().dt_bg_gc_ratio_threhold_to_trigger_gc, - log); - } - bool finish_gc_on_segment = false; - if (should_compact) - { - if (segment = segmentMergeDelta(*dm_context, segment, MergeDeltaReason::BackgroundGCThread, segment_snap); segment) - { - // Continue to check whether we need to apply more tasks on this segment - segment_snap = {}; - checkSegmentUpdate(dm_context, segment, ThreadType::BG_GC); - gc_segments_num++; - finish_gc_on_segment = true; - LOG_FMT_DEBUG( - log, - "Finish GC-merge-delta, segment={} table={}", - segment->simpleInfo(), - table_name); - } - else - { - LOG_FMT_DEBUG( - log, - "GC aborted, segment={} table={}", - segment->simpleInfo(), - table_name); - } - } - if (!finish_gc_on_segment) - LOG_FMT_TRACE( - log, - "GC skipped, segment={} table={}", - segment->simpleInfo(), - table_name); - } - catch (Exception & e) - { - e.addMessage(fmt::format("while apply gc Segment [{}] [range={}] [table={}]", segment_id, segment_range.toDebugString(), table_name)); - e.rethrow(); - } - } - - if (gc_segments_num != 0) - { - LOG_FMT_DEBUG(log, "Finish GC, gc_segments_num={}", gc_segments_num); - } - return gc_segments_num; -} - void DeltaMergeStore::check(const Context & /*db_context*/) { std::shared_lock lock(read_write_mutex); diff --git a/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalBg.cpp b/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalBg.cpp new file mode 100644 index 00000000000..0627734ccd2 --- /dev/null +++ b/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalBg.cpp @@ -0,0 +1,450 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +namespace CurrentMetrics +{ +extern const Metric DT_SnapshotOfDeltaMerge; +} // namespace CurrentMetrics + +namespace DB +{ + +namespace FailPoints +{ +extern const char pause_before_dt_background_delta_merge[]; +extern const char pause_until_dt_background_delta_merge[]; +} // namespace FailPoints + +namespace DM +{ + +void DeltaMergeStore::setUpBackgroundTask(const DMContextPtr & dm_context) +{ + // Callbacks for cleaning outdated DTFiles. Note that there is a chance + // that callbacks is called after the `DeltaMergeStore` dropped, we must + // make the callbacks safe. + ExternalPageCallbacks callbacks; + callbacks.ns_id = storage_pool->getNamespaceId(); + callbacks.scanner = [path_pool_weak_ref = std::weak_ptr(path_pool), file_provider = global_context.getFileProvider()]() { + ExternalPageCallbacks::PathAndIdsVec path_and_ids_vec; + + // If the StoragePathPool is invalid, meaning we call `scanner` after dropping the table, + // simply return an empty list is OK. + auto path_pool = path_pool_weak_ref.lock(); + if (!path_pool) + return path_and_ids_vec; + + // Return the DTFiles on disks. + auto delegate = path_pool->getStableDiskDelegator(); + // Only return the DTFiles can be GC. The page id of not able to be GC files, which is being ingested or in the middle of + // SegmentSplit/Merge/MergeDelta, is not yet applied + // to PageStorage is marked as not able to be GC, so we don't return them and run the `remover` + DMFile::ListOptions options; + options.only_list_can_gc = true; + for (auto & root_path : delegate.listPaths()) + { + std::set ids_under_path; + auto file_ids_in_current_path = DMFile::listAllInPath(file_provider, root_path, options); + path_and_ids_vec.emplace_back(root_path, std::move(file_ids_in_current_path)); + } + return path_and_ids_vec; + }; + callbacks.remover = [path_pool_weak_ref = std::weak_ptr(path_pool), // + file_provider = global_context.getFileProvider(), + logger = log](const ExternalPageCallbacks::PathAndIdsVec & path_and_ids_vec, const std::set & valid_ids) { + // If the StoragePathPool is invalid, meaning we call `remover` after dropping the table, + // simply skip is OK. + auto path_pool = path_pool_weak_ref.lock(); + if (!path_pool) + return; + + SYNC_FOR("before_DeltaMergeStore::callbacks_remover_remove"); + auto delegate = path_pool->getStableDiskDelegator(); + for (const auto & [path, ids] : path_and_ids_vec) + { + for (auto id : ids) + { + if (valid_ids.count(id)) + continue; + + // Note that page_id is useless here. + auto dmfile = DMFile::restore(file_provider, id, /* page_id= */ 0, path, DMFile::ReadMetaMode::none()); + if (unlikely(!dmfile)) + { + // If the dtfile directory is not exist, it means `StoragePathPool::drop` have been + // called in another thread. Just try to clean if any id is left. + try + { + delegate.removeDTFile(id); + } + catch (DB::Exception & e) + { + // just ignore + } + LOG_FMT_INFO(logger, + "GC try remove useless DM file, but file not found and may have been removed, dmfile={}", + DMFile::getPathByStatus(path, id, DMFile::Status::READABLE)); + } + else if (dmfile->canGC()) + { + // StoragePathPool::drop may be called concurrently, ignore and continue next file if any exception thrown + String err_msg; + try + { + // scanner should only return dtfiles that can GC, + // just another check here. + delegate.removeDTFile(dmfile->fileId()); + dmfile->remove(file_provider); + } + catch (DB::Exception & e) + { + err_msg = e.message(); + } + catch (Poco::Exception & e) + { + err_msg = e.message(); + } + if (err_msg.empty()) + LOG_FMT_INFO(logger, "GC removed useless DM file, dmfile={}", dmfile->path()); + else + LOG_FMT_INFO(logger, "GC try remove useless DM file, but error happen, dmfile={} err_msg={}", dmfile->path(), err_msg); + } + } + } + }; + // remember to unregister it when shutdown + storage_pool->dataRegisterExternalPagesCallbacks(callbacks); + storage_pool->enableGC(); + + background_task_handle = background_pool.addTask([this] { return handleBackgroundTask(false); }); + + blockable_background_pool_handle = blockable_background_pool.addTask([this] { return handleBackgroundTask(true); }); + + // Do place delta index. + for (auto & [end, segment] : segments) + { + (void)end; + checkSegmentUpdate(dm_context, segment, ThreadType::Init); + } + + // Wake up to do place delta index tasks. + background_task_handle->wake(); + blockable_background_pool_handle->wake(); +} + + +bool DeltaMergeStore::updateGCSafePoint() +{ + if (auto pd_client = global_context.getTMTContext().getPDClient(); !pd_client->isMock()) + { + auto safe_point = PDClientHelper::getGCSafePointWithRetry( + pd_client, + /* ignore_cache= */ false, + global_context.getSettingsRef().safe_point_update_interval_seconds); + latest_gc_safe_point.store(safe_point, std::memory_order_release); + return true; + } + return false; +} + +bool DeltaMergeStore::handleBackgroundTask(bool heavy) +{ + auto task = background_tasks.nextTask(heavy, log); + if (!task) + return false; + + // Update GC safe point before background task + // Foreground task don't get GC safe point from remote, but we better make it as up to date as possible. + if (updateGCSafePoint()) + { + /// Note that `task.dm_context->db_context` will be free after query is finish. We should not use that in background task. + task.dm_context->min_version = latest_gc_safe_point.load(std::memory_order_relaxed); + LOG_FMT_DEBUG(log, "Task {} GC safe point: {}", toString(task.type), task.dm_context->min_version); + } + + SegmentPtr left, right; + ThreadType type = ThreadType::Write; + try + { + switch (task.type) + { + case TaskType::Split: + std::tie(left, right) = segmentSplit(*task.dm_context, task.segment, false); + type = ThreadType::BG_Split; + break; + case TaskType::Merge: + segmentMerge(*task.dm_context, {task.segment, task.next_segment}, false); + type = ThreadType::BG_Merge; + break; + case TaskType::MergeDelta: + { + FAIL_POINT_PAUSE(FailPoints::pause_before_dt_background_delta_merge); + left = segmentMergeDelta(*task.dm_context, task.segment, MergeDeltaReason::BackgroundThreadPool); + type = ThreadType::BG_MergeDelta; + // Wake up all waiting threads if failpoint is enabled + FailPointHelper::disableFailPoint(FailPoints::pause_until_dt_background_delta_merge); + break; + } + case TaskType::Compact: + task.segment->compactDelta(*task.dm_context); + left = task.segment; + type = ThreadType::BG_Compact; + break; + case TaskType::Flush: + task.segment->flushCache(*task.dm_context); + // After flush cache, better place delta index. + task.segment->placeDeltaIndex(*task.dm_context); + left = task.segment; + type = ThreadType::BG_Flush; + break; + case TaskType::PlaceIndex: + task.segment->placeDeltaIndex(*task.dm_context); + break; + default: + throw Exception(fmt::format("Unsupported task type: {}", toString(task.type))); + } + } + catch (const Exception & e) + { + LOG_FMT_ERROR( + log, + "Execute task on segment failed, task={} segment={}{} err={}", + DeltaMergeStore::toString(task.type), + task.segment->simpleInfo(), + ((bool)task.next_segment ? (fmt::format(" next_segment={}", task.next_segment->simpleInfo())) : ""), + e.message()); + e.rethrow(); + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + throw; + } + + // continue to check whether we need to apply more tasks after this task is ended. + if (left) + checkSegmentUpdate(task.dm_context, left, type); + if (right) + checkSegmentUpdate(task.dm_context, right, type); + + return true; +} + +namespace GC +{ +// Returns true if it needs gc. +// This is for optimization purpose, does not mean to be accurate. +bool shouldCompactStable(const SegmentPtr & seg, DB::Timestamp gc_safepoint, double ratio_threshold, const LoggerPtr & log) +{ + // Always GC. + if (ratio_threshold < 1.0) + return true; + + const auto & property = seg->getStable()->getStableProperty(); + LOG_FMT_TRACE(log, "{}", property.toDebugString()); + // No data older than safe_point to GC. + if (property.gc_hint_version > gc_safepoint) + return false; + // A lot of MVCC versions to GC. + if (property.num_versions > property.num_rows * ratio_threshold) + return true; + // A lot of non-effective MVCC versions to GC. + if (property.num_versions > property.num_puts * ratio_threshold) + return true; + return false; +} + +bool shouldCompactDeltaWithStable(const DMContext & context, const SegmentSnapshotPtr & snap, const RowKeyRange & segment_range, double ratio_threshold, const LoggerPtr & log) +{ + auto actual_delete_range = snap->delta->getSquashDeleteRange().shrink(segment_range); + if (actual_delete_range.none()) + return false; + + auto [delete_rows, delete_bytes] = snap->stable->getApproxRowsAndBytes(context, actual_delete_range); + + auto stable_rows = snap->stable->getRows(); + auto stable_bytes = snap->stable->getBytes(); + + LOG_FMT_TRACE(log, "delete range rows [{}], delete_bytes [{}] stable_rows [{}] stable_bytes [{}]", delete_rows, delete_bytes, stable_rows, stable_bytes); + + // 1. for small tables, the data may just reside in delta and stable_rows may be 0, + // so the `=` in `>=` is needed to cover the scenario when set tiflash replica of small tables to 0. + // (i.e. `actual_delete_range` is not none, but `delete_rows` and `stable_rows` are both 0). + // 2. the disadvantage of `=` in `>=` is that it may trigger an extra gc when write apply snapshot file to an empty segment, + // because before write apply snapshot file, it will write a delete range first, and will meet the following gc criteria. + // But the cost should be really minor because merge delta on an empty segment should be very fast. + // What's more, we can ignore this kind of delete range in future to avoid this extra gc. + bool should_compact = (delete_rows >= stable_rows * ratio_threshold) || (delete_bytes >= stable_bytes * ratio_threshold); + return should_compact; +} +} // namespace GC + +UInt64 DeltaMergeStore::onSyncGc(Int64 limit) +{ + if (shutdown_called.load(std::memory_order_relaxed)) + return 0; + + if (!updateGCSafePoint()) + return 0; + + { + std::shared_lock lock(read_write_mutex); + // avoid gc on empty tables + if (segments.size() == 1) + { + const auto & seg = segments.begin()->second; + if (seg->getEstimatedRows() == 0) + return 0; + } + } + + DB::Timestamp gc_safe_point = latest_gc_safe_point.load(std::memory_order_acquire); + LOG_FMT_TRACE(log, + "GC on table {} start with key: {}, gc_safe_point: {}, max gc limit: {}", + table_name, + next_gc_check_key.toDebugString(), + gc_safe_point, + limit); + + UInt64 check_segments_num = 0; + Int64 gc_segments_num = 0; + while (gc_segments_num < limit) + { + // If the store is shut down, give up running GC on it. + if (shutdown_called.load(std::memory_order_relaxed)) + break; + + auto dm_context = newDMContext(global_context, global_context.getSettingsRef(), "onSyncGc"); + SegmentPtr segment; + SegmentSnapshotPtr segment_snap; + { + std::shared_lock lock(read_write_mutex); + + auto segment_it = segments.upper_bound(next_gc_check_key.toRowKeyValueRef()); + if (segment_it == segments.end()) + segment_it = segments.begin(); + + // we have check all segments, stop here + if (check_segments_num >= segments.size()) + break; + check_segments_num++; + + segment = segment_it->second; + next_gc_check_key = segment_it->first.toRowKeyValue(); + segment_snap = segment->createSnapshot(*dm_context, /* for_update */ true, CurrentMetrics::DT_SnapshotOfDeltaMerge); + } + + assert(segment != nullptr); + if (segment->hasAbandoned() || segment_snap == nullptr) + continue; + + const auto segment_id = segment->segmentId(); + RowKeyRange segment_range = segment->getRowKeyRange(); + + // meet empty segment, try merge it + if (segment_snap->getRows() == 0) + { + // release segment_snap before checkSegmentUpdate, otherwise this segment is still in update status. + segment_snap = {}; + checkSegmentUpdate(dm_context, segment, ThreadType::BG_GC); + continue; + } + + try + { + // Check whether we should apply gc on this segment + bool should_compact = false; + if (GC::shouldCompactDeltaWithStable( + *dm_context, + segment_snap, + segment_range, + global_context.getSettingsRef().dt_bg_gc_delta_delete_ratio_to_trigger_gc, + log)) + { + should_compact = true; + } + else if (segment->getLastCheckGCSafePoint() < gc_safe_point) + { + // Avoid recheck this segment when gc_safe_point doesn't change regardless whether we trigger this segment's DeltaMerge or not. + // Because after we calculate StableProperty and compare it with this gc_safe_point, + // there is no need to recheck it again using the same gc_safe_point. + // On the other hand, if it should do DeltaMerge using this gc_safe_point, and the DeltaMerge is interruptted by other process, + // it's still worth to wait another gc_safe_point to check this segment again. + segment->setLastCheckGCSafePoint(gc_safe_point); + dm_context->min_version = gc_safe_point; + + // calculate StableProperty if needed + if (!segment->getStable()->isStablePropertyCached()) + segment->getStable()->calculateStableProperty(*dm_context, segment_range, isCommonHandle()); + + should_compact = GC::shouldCompactStable( + segment, + gc_safe_point, + global_context.getSettingsRef().dt_bg_gc_ratio_threhold_to_trigger_gc, + log); + } + bool finish_gc_on_segment = false; + if (should_compact) + { + if (segment = segmentMergeDelta(*dm_context, segment, MergeDeltaReason::BackgroundGCThread, segment_snap); segment) + { + // Continue to check whether we need to apply more tasks on this segment + segment_snap = {}; + checkSegmentUpdate(dm_context, segment, ThreadType::BG_GC); + gc_segments_num++; + finish_gc_on_segment = true; + LOG_FMT_DEBUG( + log, + "Finish GC-merge-delta, segment={} table={}", + segment->simpleInfo(), + table_name); + } + else + { + LOG_FMT_DEBUG( + log, + "GC aborted, segment={} table={}", + segment->simpleInfo(), + table_name); + } + } + if (!finish_gc_on_segment) + LOG_FMT_TRACE( + log, + "GC skipped, segment={} table={}", + segment->simpleInfo(), + table_name); + } + catch (Exception & e) + { + e.addMessage(fmt::format("while apply gc Segment [{}] [range={}] [table={}]", segment_id, segment_range.toDebugString(), table_name)); + e.rethrow(); + } + } + + if (gc_segments_num != 0) + { + LOG_FMT_DEBUG(log, "Finish GC, gc_segments_num={}", gc_segments_num); + } + return gc_segments_num; +} + +} // namespace DM +} // namespace DB From d1f7d2500f1fcf405b23c72a753265b5d23901d6 Mon Sep 17 00:00:00 2001 From: Wish Date: Tue, 13 Sep 2022 17:38:29 +0800 Subject: [PATCH 14/17] storage: merge in gc thread Signed-off-by: Wish --- dbms/src/Storages/DeltaMerge/DMContext.h | 6 + .../Storages/DeltaMerge/DeltaMergeStore.cpp | 56 +--- .../src/Storages/DeltaMerge/DeltaMergeStore.h | 25 +- .../DeltaMerge/DeltaMergeStore_InternalBg.cpp | 259 ++++++++++++------ 4 files changed, 211 insertions(+), 135 deletions(-) diff --git a/dbms/src/Storages/DeltaMerge/DMContext.h b/dbms/src/Storages/DeltaMerge/DMContext.h index 76da0714844..59320665011 100644 --- a/dbms/src/Storages/DeltaMerge/DMContext.h +++ b/dbms/src/Storages/DeltaMerge/DMContext.h @@ -72,6 +72,10 @@ struct DMContext : private boost::noncopyable const size_t delta_small_column_file_bytes; // The expected stable pack rows. const size_t stable_pack_rows; + // The rows of segment to be regarded as small. Small segments will be merged. + const size_t small_segment_rows; + // The bytes of segment to be regarded as small. Small segments will be merged. + const size_t small_segment_bytes; // The number of points to check for calculating region split. const size_t region_split_check_points = 128; @@ -111,6 +115,8 @@ struct DMContext : private boost::noncopyable , delta_small_column_file_rows(settings.dt_segment_delta_small_column_file_rows) , delta_small_column_file_bytes(settings.dt_segment_delta_small_column_file_size) , stable_pack_rows(settings.dt_segment_stable_pack_rows) + , small_segment_rows(settings.dt_segment_limit_rows / 3) + , small_segment_bytes(settings.dt_segment_limit_size / 3) , enable_logical_split(settings.dt_enable_logical_split) , read_delta_only(settings.dt_read_delta_only) , read_stable_only(settings.dt_read_stable_only) diff --git a/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp b/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp index 018c601d609..cd291e65882 100644 --- a/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp +++ b/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp @@ -105,7 +105,6 @@ std::pair DeltaMergeStore::MergeDeltaTaskPool::tryAddTask(const Back switch (task.type) { case TaskType::Split: - case TaskType::Merge: case TaskType::MergeDelta: is_heavy = true; // reserve some task space for light tasks @@ -1178,8 +1177,6 @@ void DeltaMergeStore::checkSegmentUpdate(const DMContextPtr & dm_context, const || delta_bytes - delta_last_try_split_bytes >= delta_cache_limit_bytes)) || (segment_rows >= segment_limit_rows * 3 || segment_bytes >= segment_limit_bytes * 3); - bool should_merge = segment_rows < segment_limit_rows / 3 && segment_bytes < segment_limit_bytes / 3; - // Don't do compact on starting up. bool should_compact = (thread_type != ThreadType::Init) && std::max(static_cast(column_file_count) - delta_last_try_compact_column_files, 0) >= 10; @@ -1236,7 +1233,7 @@ void DeltaMergeStore::checkSegmentUpdate(const DMContextPtr & dm_context, const { delta_last_try_flush_rows = delta_rows; delta_last_try_flush_bytes = delta_bytes; - try_add_background_task(BackgroundTask{TaskType::Flush, dm_context, segment, {}}); + try_add_background_task(BackgroundTask{TaskType::Flush, dm_context, segment}); } } } @@ -1246,36 +1243,6 @@ void DeltaMergeStore::checkSegmentUpdate(const DMContextPtr & dm_context, const if (segment->getDelta()->isUpdating()) return; - /// Now start trying structure update. - - auto get_merge_sibling = [&]() -> SegmentPtr { - /// For complexity reason, currently we only try to merge with next segment. Normally it is good enough. - - // The last segment cannot be merged. - if (segment->getRowKeyRange().isEndInfinite()) - return {}; - SegmentPtr next_segment; - { - std::shared_lock read_write_lock(read_write_mutex); - - auto it = segments.find(segment->getRowKeyRange().getEnd()); - // check legality - if (it == segments.end()) - return {}; - auto & cur_segment = it->second; - if (cur_segment.get() != segment.get()) - return {}; - ++it; - if (it == segments.end()) - return {}; - next_segment = it->second; - auto limit = dm_context->segment_limit_rows / 5; - if (next_segment->getEstimatedRows() >= limit) - return {}; - } - return next_segment; - }; - auto try_fg_merge_delta = [&]() -> SegmentPtr { // If the table is already dropped, don't trigger foreground merge delta when executing `remove region peer`, // or the raft-log apply threads may be blocked. @@ -1303,7 +1270,7 @@ void DeltaMergeStore::checkSegmentUpdate(const DMContextPtr & dm_context, const if (should_background_merge_delta) { delta_last_try_merge_delta_rows = delta_rows; - try_add_background_task(BackgroundTask{TaskType::MergeDelta, dm_context, segment, {}}); + try_add_background_task(BackgroundTask{TaskType::MergeDelta, dm_context, segment}); return true; } return false; @@ -1313,12 +1280,12 @@ void DeltaMergeStore::checkSegmentUpdate(const DMContextPtr & dm_context, const { delta_last_try_split_rows = delta_rows; delta_last_try_split_bytes = delta_bytes; - try_add_background_task(BackgroundTask{TaskType::Split, dm_context, seg, {}}); + try_add_background_task(BackgroundTask{TaskType::Split, dm_context, seg}); return true; } return false; }; - auto try_fg_split = [&](const SegmentPtr & my_segment) -> bool { + auto try_fg_split = [&](const SegmentPtr & my_segment) { auto my_segment_size = my_segment->getEstimatedBytes(); auto my_should_split = my_segment_size >= dm_context->segment_force_split_bytes; if (my_should_split && !my_segment->isSplitForbidden()) @@ -1334,15 +1301,6 @@ void DeltaMergeStore::checkSegmentUpdate(const DMContextPtr & dm_context, const } return false; }; - auto try_bg_merge = [&]() { - SegmentPtr merge_sibling; - if (should_merge && (merge_sibling = get_merge_sibling())) - { - try_add_background_task(BackgroundTask{TaskType::Merge, dm_context, segment, merge_sibling}); - return true; - } - return false; - }; auto try_bg_compact = [&]() { /// Compact task should be a really low priority task. /// And if the segment is flushing, @@ -1352,7 +1310,7 @@ void DeltaMergeStore::checkSegmentUpdate(const DMContextPtr & dm_context, const if (should_compact && !segment->isFlushing()) { delta_last_try_compact_column_files = column_file_count; - try_add_background_task(BackgroundTask{TaskType::Compact, dm_context, segment, {}}); + try_add_background_task(BackgroundTask{TaskType::Compact, dm_context, segment}); return true; } return false; @@ -1361,7 +1319,7 @@ void DeltaMergeStore::checkSegmentUpdate(const DMContextPtr & dm_context, const if (should_place_delta_index) { delta_last_try_place_delta_index_rows = delta_rows; - try_add_background_task(BackgroundTask{TaskType::PlaceIndex, dm_context, segment, {}}); + try_add_background_task(BackgroundTask{TaskType::PlaceIndex, dm_context, segment}); return true; } return false; @@ -1406,8 +1364,6 @@ void DeltaMergeStore::checkSegmentUpdate(const DMContextPtr & dm_context, const if (try_bg_merge_delta()) return; } - if (try_bg_merge()) - return; if (try_bg_compact()) return; if (try_place_delta_index()) diff --git a/dbms/src/Storages/DeltaMerge/DeltaMergeStore.h b/dbms/src/Storages/DeltaMerge/DeltaMergeStore.h index 7d4f9a6c1d9..9b7f0522d8d 100644 --- a/dbms/src/Storages/DeltaMerge/DeltaMergeStore.h +++ b/dbms/src/Storages/DeltaMerge/DeltaMergeStore.h @@ -173,7 +173,6 @@ class DeltaMergeStore : private boost::noncopyable enum TaskType { Split, - Merge, MergeDelta, Compact, Flush, @@ -221,8 +220,6 @@ class DeltaMergeStore : private boost::noncopyable { case Split: return "Split"; - case Merge: - return "Merge"; case MergeDelta: return "MergeDelta"; case Compact: @@ -259,7 +256,6 @@ class DeltaMergeStore : private boost::noncopyable DMContextPtr dm_context; SegmentPtr segment; - SegmentPtr next_segment; explicit operator bool() const { return segment != nullptr; } }; @@ -405,6 +401,27 @@ class DeltaMergeStore : private boost::noncopyable /// Iterator over all segments and apply gc jobs. UInt64 onSyncGc(Int64 limit); + /** + * Try to merge the segment in the current thread as the GC operation. + * This function may be blocking, and should be called in the GC background thread. + */ + SegmentPtr gcTrySegmentMerge(const DMContextPtr & dm_context, const SegmentPtr & segment); + + /** + * Try to merge delta in the current thread as the GC operation. + * This function may be blocking, and should be called in the GC background thread. + */ + SegmentPtr gcTrySegmentMergeDelta(const DMContextPtr & dm_context, const SegmentPtr & segment, DB::Timestamp gc_safe_point); + + /** + * Starting from the given base segment, find continuous segments that could be merged. + * + * When there are mergeable segments, the baseSegment is returned in index 0 and mergeable segments are then placed in order. + * It is ensured that there are at least 2 elements in the returned vector. + * When there is no mergeable segment, the returned vector will be empty. + */ + std::vector getMergeableSegments(const DMContextPtr & context, const SegmentPtr & baseSegment); + /// Apply DDL `commands` on `table_columns` void applyAlters(const AlterCommands & commands, // OptionTableInfoConstRef table_info, diff --git a/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalBg.cpp b/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalBg.cpp index 0627734ccd2..920653c1b28 100644 --- a/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalBg.cpp +++ b/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalBg.cpp @@ -149,6 +149,51 @@ void DeltaMergeStore::setUpBackgroundTask(const DMContextPtr & dm_context) blockable_background_pool_handle->wake(); } +std::vector DeltaMergeStore::getMergeableSegments(const DMContextPtr & context, const SegmentPtr & baseSegment) +{ + // Last segment cannot be merged. + if (baseSegment->getRowKeyRange().isEndInfinite()) + return {}; + + // We only merge small segments into a larger one. + // Note: it is possible that there is a very small segment close to a very large segment. + // In this case, the small segment will not get merged. It is possible that we can allow + // segment merging for this case in future. + auto max_total_rows = context->small_segment_rows; + auto max_total_bytes = context->small_segment_bytes; + + std::vector results; + { + std::shared_lock lock(read_write_mutex); + + if (!isSegmentValid(lock, baseSegment)) + return {}; + + results.reserve(4); // In most cases we will only find <= 4 segments to merge. + results.emplace_back(baseSegment); + auto accumulated_rows = baseSegment->getEstimatedRows(); + auto accumulated_bytes = baseSegment->getEstimatedBytes(); + + auto it = segments.upper_bound(baseSegment->getRowKeyRange().getEnd()); + while (it != segments.end()) + { + const auto & this_seg = it->second; + const auto this_rows = this_seg->getEstimatedRows(); + const auto this_bytes = this_seg->getEstimatedBytes(); + if (accumulated_rows + this_rows >= max_total_rows || accumulated_bytes + this_bytes >= max_total_bytes) + break; + results.emplace_back(this_seg); + accumulated_rows += this_rows; + accumulated_bytes += this_bytes; + it++; + } + } + + if (results.size() < 2) + return {}; + + return results; +} bool DeltaMergeStore::updateGCSafePoint() { @@ -189,10 +234,6 @@ bool DeltaMergeStore::handleBackgroundTask(bool heavy) std::tie(left, right) = segmentSplit(*task.dm_context, task.segment, false); type = ThreadType::BG_Split; break; - case TaskType::Merge: - segmentMerge(*task.dm_context, {task.segment, task.next_segment}, false); - type = ThreadType::BG_Merge; - break; case TaskType::MergeDelta: { FAIL_POINT_PAUSE(FailPoints::pause_before_dt_background_delta_merge); @@ -225,10 +266,9 @@ bool DeltaMergeStore::handleBackgroundTask(bool heavy) { LOG_FMT_ERROR( log, - "Execute task on segment failed, task={} segment={}{} err={}", + "Execute task on segment failed, task={} segment={} err={}", DeltaMergeStore::toString(task.type), task.segment->simpleInfo(), - ((bool)task.next_segment ? (fmt::format(" next_segment={}", task.next_segment->simpleInfo())) : ""), e.message()); e.rethrow(); } @@ -249,6 +289,7 @@ bool DeltaMergeStore::handleBackgroundTask(bool heavy) namespace GC { + // Returns true if it needs gc. // This is for optimization purpose, does not mean to be accurate. bool shouldCompactStable(const SegmentPtr & seg, DB::Timestamp gc_safepoint, double ratio_threshold, const LoggerPtr & log) @@ -294,8 +335,124 @@ bool shouldCompactDeltaWithStable(const DMContext & context, const SegmentSnapsh bool should_compact = (delete_rows >= stable_rows * ratio_threshold) || (delete_bytes >= stable_bytes * ratio_threshold); return should_compact; } + } // namespace GC +SegmentPtr DeltaMergeStore::gcTrySegmentMerge(const DMContextPtr & dm_context, const SegmentPtr & segment) +{ + auto segment_rows = segment->getEstimatedRows(); + auto segment_bytes = segment->getEstimatedBytes(); + if (segment_rows >= dm_context->small_segment_rows || segment_bytes >= dm_context->small_segment_bytes) + { + LOG_FMT_TRACE( + log, + "GC - Merge skipped because current segment is not small, segment={} table={}", + segment->simpleInfo(), + table_name); + return {}; + } + + auto segments_to_merge = getMergeableSegments(dm_context, segment); + if (segments_to_merge.size() < 2) + { + LOG_FMT_TRACE( + log, + "GC - Merge skipped because cannot find adjacent segments to merge, segment={} table={}", + segment->simpleInfo(), + table_name); + return {}; + } + + LOG_FMT_DEBUG( + log, + "GC - Trigger Merge, segment={} table={}", + segment->simpleInfo(), + table_name); + auto new_segment = segmentMerge(*dm_context, segments_to_merge, false); + if (new_segment) + { + checkSegmentUpdate(dm_context, segment, ThreadType::BG_GC); + } + + return new_segment; +} + +SegmentPtr DeltaMergeStore::gcTrySegmentMergeDelta(const DMContextPtr & dm_context, const SegmentPtr & segment, DB::Timestamp gc_safe_point) +{ + SegmentSnapshotPtr segment_snap; + { + std::shared_lock lock(read_write_mutex); // TODO: Do we really need this lock? + segment_snap = segment->createSnapshot(*dm_context, /* for_update */ true, CurrentMetrics::DT_SnapshotOfDeltaMerge); + } + + if (segment->hasAbandoned() || !segment_snap) + { + LOG_FMT_TRACE( + log, + "GC - MergeDelta skipped because snapshot failed, segment={} table={}", + segment->simpleInfo(), + table_name); + return {}; + } + + RowKeyRange segment_range = segment->getRowKeyRange(); + + bool should_compact = false; + if (GC::shouldCompactDeltaWithStable( + *dm_context, + segment_snap, + segment_range, + global_context.getSettingsRef().dt_bg_gc_delta_delete_ratio_to_trigger_gc, + log)) + { + should_compact = true; + } + else if (segment->getLastCheckGCSafePoint() < gc_safe_point) + { + // Avoid recheck this segment when gc_safe_point doesn't change regardless whether we trigger this segment's DeltaMerge or not. + // Because after we calculate StableProperty and compare it with this gc_safe_point, + // there is no need to recheck it again using the same gc_safe_point. + // On the other hand, if it should do DeltaMerge using this gc_safe_point, and the DeltaMerge is interruptted by other process, + // it's still worth to wait another gc_safe_point to check this segment again. + segment->setLastCheckGCSafePoint(gc_safe_point); + dm_context->min_version = gc_safe_point; + + // calculate StableProperty if needed + if (!segment->getStable()->isStablePropertyCached()) + segment->getStable()->calculateStableProperty(*dm_context, segment_range, isCommonHandle()); + + should_compact = GC::shouldCompactStable( + segment, + gc_safe_point, + global_context.getSettingsRef().dt_bg_gc_ratio_threhold_to_trigger_gc, + log); + } + + if (!should_compact) + { + LOG_FMT_TRACE( + log, + "GC - MergeDelta skipped, segment={} table={}", + segment->simpleInfo(), + table_name); + return {}; + } + + LOG_FMT_DEBUG( + log, + "GC - Trigger MergeDelta, segment={} table={}", + segment->simpleInfo(), + table_name); + auto new_segment = segmentMergeDelta(*dm_context, segment, MergeDeltaReason::BackgroundGCThread, segment_snap); + if (new_segment) + { + segment_snap = {}; + checkSegmentUpdate(dm_context, segment, ThreadType::BG_GC); + } + + return new_segment; +} + UInt64 DeltaMergeStore::onSyncGc(Int64 limit) { if (shutdown_called.load(std::memory_order_relaxed)) @@ -333,7 +490,6 @@ UInt64 DeltaMergeStore::onSyncGc(Int64 limit) auto dm_context = newDMContext(global_context, global_context.getSettingsRef(), "onSyncGc"); SegmentPtr segment; - SegmentSnapshotPtr segment_snap; { std::shared_lock lock(read_write_mutex); @@ -348,101 +504,42 @@ UInt64 DeltaMergeStore::onSyncGc(Int64 limit) segment = segment_it->second; next_gc_check_key = segment_it->first.toRowKeyValue(); - segment_snap = segment->createSnapshot(*dm_context, /* for_update */ true, CurrentMetrics::DT_SnapshotOfDeltaMerge); } assert(segment != nullptr); - if (segment->hasAbandoned() || segment_snap == nullptr) - continue; - - const auto segment_id = segment->segmentId(); - RowKeyRange segment_range = segment->getRowKeyRange(); - - // meet empty segment, try merge it - if (segment_snap->getRows() == 0) - { - // release segment_snap before checkSegmentUpdate, otherwise this segment is still in update status. - segment_snap = {}; - checkSegmentUpdate(dm_context, segment, ThreadType::BG_GC); + if (segment->hasAbandoned()) continue; - } try { - // Check whether we should apply gc on this segment - bool should_compact = false; - if (GC::shouldCompactDeltaWithStable( - *dm_context, - segment_snap, - segment_range, - global_context.getSettingsRef().dt_bg_gc_delta_delete_ratio_to_trigger_gc, - log)) - { - should_compact = true; - } - else if (segment->getLastCheckGCSafePoint() < gc_safe_point) - { - // Avoid recheck this segment when gc_safe_point doesn't change regardless whether we trigger this segment's DeltaMerge or not. - // Because after we calculate StableProperty and compare it with this gc_safe_point, - // there is no need to recheck it again using the same gc_safe_point. - // On the other hand, if it should do DeltaMerge using this gc_safe_point, and the DeltaMerge is interruptted by other process, - // it's still worth to wait another gc_safe_point to check this segment again. - segment->setLastCheckGCSafePoint(gc_safe_point); - dm_context->min_version = gc_safe_point; - - // calculate StableProperty if needed - if (!segment->getStable()->isStablePropertyCached()) - segment->getStable()->calculateStableProperty(*dm_context, segment_range, isCommonHandle()); - - should_compact = GC::shouldCompactStable( - segment, - gc_safe_point, - global_context.getSettingsRef().dt_bg_gc_ratio_threhold_to_trigger_gc, - log); - } - bool finish_gc_on_segment = false; - if (should_compact) + SegmentPtr new_seg{}; + if (!new_seg) + new_seg = gcTrySegmentMerge(dm_context, segment); + if (!new_seg) + new_seg = gcTrySegmentMergeDelta(dm_context, segment, gc_safe_point); + + if (!new_seg) { - if (segment = segmentMergeDelta(*dm_context, segment, MergeDeltaReason::BackgroundGCThread, segment_snap); segment) - { - // Continue to check whether we need to apply more tasks on this segment - segment_snap = {}; - checkSegmentUpdate(dm_context, segment, ThreadType::BG_GC); - gc_segments_num++; - finish_gc_on_segment = true; - LOG_FMT_DEBUG( - log, - "Finish GC-merge-delta, segment={} table={}", - segment->simpleInfo(), - table_name); - } - else - { - LOG_FMT_DEBUG( - log, - "GC aborted, segment={} table={}", - segment->simpleInfo(), - table_name); - } - } - if (!finish_gc_on_segment) LOG_FMT_TRACE( log, - "GC skipped, segment={} table={}", + "GC - Skipped segment, segment={} table={}", segment->simpleInfo(), table_name); + continue; + } + + gc_segments_num++; } catch (Exception & e) { - e.addMessage(fmt::format("while apply gc Segment [{}] [range={}] [table={}]", segment_id, segment_range.toDebugString(), table_name)); + e.addMessage(fmt::format("Error while GC segment, segment={} table={}", segment->info(), table_name)); e.rethrow(); } } if (gc_segments_num != 0) - { LOG_FMT_DEBUG(log, "Finish GC, gc_segments_num={}", gc_segments_num); - } + return gc_segments_num; } From daa0eb86825e68956c41ec00241124e4205512af Mon Sep 17 00:00:00 2001 From: Wish Date: Tue, 13 Sep 2022 22:04:03 +0800 Subject: [PATCH 15/17] Add tests Signed-off-by: Wish --- dbms/src/Common/FailPoint.cpp | 1 + .../DeltaMerge/DeltaMergeStore_InternalBg.cpp | 12 +- .../DeltaMergeStore_InternalSegment.cpp | 3 + .../DeltaMerge/tests/MultiSegmentTestUtil.h | 2 +- .../tests/gtest_dm_delta_merge_store.cpp | 157 ++++++++++++++++++ 5 files changed, 172 insertions(+), 3 deletions(-) diff --git a/dbms/src/Common/FailPoint.cpp b/dbms/src/Common/FailPoint.cpp index ce2f065711a..f14d7a2d91a 100644 --- a/dbms/src/Common/FailPoint.cpp +++ b/dbms/src/Common/FailPoint.cpp @@ -79,6 +79,7 @@ std::unordered_map> FailPointHelper::f #define APPLY_FOR_FAILPOINTS(M) \ M(skip_check_segment_update) \ + M(gc_skip_update_safe_point) \ M(force_set_page_file_write_errno) \ M(force_split_io_size_4k) \ M(minimum_block_size_for_cross_join) \ diff --git a/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalBg.cpp b/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalBg.cpp index 920653c1b28..bcdbe354e22 100644 --- a/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalBg.cpp +++ b/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalBg.cpp @@ -28,6 +28,7 @@ namespace DB namespace FailPoints { +extern const char gc_skip_update_safe_point[]; extern const char pause_before_dt_background_delta_merge[]; extern const char pause_until_dt_background_delta_merge[]; } // namespace FailPoints @@ -458,8 +459,15 @@ UInt64 DeltaMergeStore::onSyncGc(Int64 limit) if (shutdown_called.load(std::memory_order_relaxed)) return 0; - if (!updateGCSafePoint()) - return 0; + bool skip_update_safe_point = false; + fiu_do_on(FailPoints::gc_skip_update_safe_point, { + skip_update_safe_point = true; + }); + if (!skip_update_safe_point) + { + if (!updateGCSafePoint()) + return 0; + } { std::shared_lock lock(read_write_mutex); diff --git a/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalSegment.cpp b/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalSegment.cpp index 39a6e58d0c6..5447423dfad 100644 --- a/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalSegment.cpp +++ b/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalSegment.cpp @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include #include #include @@ -192,6 +193,8 @@ SegmentPtr DeltaMergeStore::segmentMerge(DMContext & dm_context, const std::vect if (seg->flushCache(dm_context)) break; + SYNC_FOR("before_DeltaMergeStore::segmentMerge|retry_flush"); + // Else: retry. Flush could fail. Typical cases: // #1. The segment is abandoned (due to an update is finished) // #2. There is another flush in progress, for example, triggered in background diff --git a/dbms/src/Storages/DeltaMerge/tests/MultiSegmentTestUtil.h b/dbms/src/Storages/DeltaMerge/tests/MultiSegmentTestUtil.h index 100f27912e6..429def67d41 100644 --- a/dbms/src/Storages/DeltaMerge/tests/MultiSegmentTestUtil.h +++ b/dbms/src/Storages/DeltaMerge/tests/MultiSegmentTestUtil.h @@ -84,7 +84,7 @@ class MultiSegmentTestUtil : private boost::noncopyable { UNUSED(_key); LOG_FMT_INFO(log, "Segment #{}: Range = {}", segment_idx, seg->getRowKeyRange().toDebugString()); - rows_by_segments[segment_idx] = seg->getStable()->getRows(); + rows_by_segments[segment_idx] = seg->getEstimatedRows(); expected_stable_rows[segment_idx] = seg->getStable()->getRows(); expected_delta_rows[segment_idx] = seg->getDelta()->getRows(); segment_idx++; diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_dm_delta_merge_store.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_dm_delta_merge_store.cpp index bca226c8b7b..c10dbd6df35 100644 --- a/dbms/src/Storages/DeltaMerge/tests/gtest_dm_delta_merge_store.cpp +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_dm_delta_merge_store.cpp @@ -41,6 +41,7 @@ namespace DB { namespace FailPoints { +extern const char gc_skip_update_safe_point[]; extern const char pause_before_dt_background_delta_merge[]; extern const char pause_until_dt_background_delta_merge[]; extern const char force_triggle_background_merge_delta[]; @@ -3384,6 +3385,162 @@ try CATCH +class DeltaMergeStoreBackgroundTest + : public DB::base::TiFlashStorageTestBasic +{ +public: + void SetUp() override + { + FailPointHelper::enableFailPoint(FailPoints::gc_skip_update_safe_point); + + try + { + TiFlashStorageTestBasic::SetUp(); + setupDMStore(); + // Split into 4 segments. + helper = std::make_unique(*db_context); + helper->prepareSegments(store, 50, DMTestEnv::PkType::CommonHandle); + } + CATCH + } + + void TearDown() override + { + TiFlashStorageTestBasic::TearDown(); + FailPointHelper::disableFailPoint(FailPoints::gc_skip_update_safe_point); + } + + void setupDMStore() + { + auto cols = DMTestEnv::getDefaultColumns(DMTestEnv::PkType::CommonHandle); + store = std::make_shared(*db_context, + false, + "test", + DB::base::TiFlashStorageTestBasic::getCurrentFullTestName(), + 101, + *cols, + (*cols)[0], + true, + 1, + DeltaMergeStore::Settings()); + dm_context = store->newDMContext(*db_context, db_context->getSettingsRef(), DB::base::TiFlashStorageTestBasic::getCurrentFullTestName()); + } + +protected: + std::unique_ptr helper{}; + DeltaMergeStorePtr store; + DMContextPtr dm_context; +}; + +TEST_F(DeltaMergeStoreBackgroundTest, GCWillMergeMultipleSegments) +try +{ + ASSERT_EQ(store->segments.size(), 4); + auto gc_n = store->onSyncGc(1); + ASSERT_EQ(store->segments.size(), 1); + ASSERT_EQ(gc_n, 1); +} +CATCH + +TEST_F(DeltaMergeStoreBackgroundTest, GCOnlyMergeSmallSegments) +try +{ + UInt64 gc_n = 0; + + // Note: initially we have 4 segments, each segment contains 50 rows. + + ASSERT_EQ(store->segments.size(), 4); + db_context->getGlobalContext().getSettingsRef().dt_segment_limit_rows = 10; + gc_n = store->onSyncGc(100); + ASSERT_EQ(store->segments.size(), 4); + ASSERT_EQ(gc_n, 0); + + // In this case, merge two segments will exceed small_segment_rows, so no merge will happen + db_context->getGlobalContext().getSettingsRef().dt_segment_limit_rows = 55 * 3; + gc_n = store->onSyncGc(100); + ASSERT_EQ(store->segments.size(), 4); + ASSERT_EQ(gc_n, 0); + + // In this case, we will only merge two segments and then stop. + // [50, 50, 50, 50] => [100, 100] + db_context->getGlobalContext().getSettingsRef().dt_segment_limit_rows = 105 * 3; + gc_n = store->onSyncGc(100); + ASSERT_EQ(store->segments.size(), 2); + ASSERT_EQ(gc_n, 2); + helper->resetExpectedRows(); + ASSERT_EQ(helper->rows_by_segments[0], 100); + ASSERT_EQ(helper->rows_by_segments[1], 100); + + gc_n = store->onSyncGc(100); + ASSERT_EQ(store->segments.size(), 2); + ASSERT_EQ(gc_n, 0); + helper->verifyExpectedRowsForAllSegments(); +} +CATCH + +TEST_F(DeltaMergeStoreBackgroundTest, GCMergeAndStop) +try +{ + UInt64 gc_n = 0; + + // Note: initially we have 4 segments, each segment contains 50 rows. + + ASSERT_EQ(store->segments.size(), 4); + + // In this case, we will only merge two segments and then stop. + // [50, 50, 50, 50] => [100, 50, 50] + db_context->getGlobalContext().getSettingsRef().dt_segment_limit_rows = 105 * 3; + gc_n = store->onSyncGc(1); + ASSERT_EQ(store->segments.size(), 3); + ASSERT_EQ(gc_n, 1); + helper->resetExpectedRows(); + ASSERT_EQ(helper->rows_by_segments[0], 100); + ASSERT_EQ(helper->rows_by_segments[1], 50); + ASSERT_EQ(helper->rows_by_segments[2], 50); +} +CATCH + +TEST_F(DeltaMergeStoreBackgroundTest, GCMergeWhileFlushing) +try +{ + ASSERT_EQ(store->segments.size(), 4); + + Block block = DMTestEnv::prepareSimpleWriteBlock(0, 500, false, DMTestEnv::PkType::CommonHandle, 10 /* new tso */); + store->write(*db_context, db_context->getSettingsRef(), block); + + // Currently, when there is a flush in progress, the segment merge in GC thread will be blocked. + + auto sp_flush_commit = SyncPointCtl::enableInScope("before_ColumnFileFlushTask::commit"); + auto sp_merge_flush_retry = SyncPointCtl::enableInScope("before_DeltaMergeStore::segmentMerge|retry_flush"); + + auto th_flush = std::async([&]() { + auto result = store->segments.begin()->second->flushCache(*dm_context); + ASSERT_TRUE(result); + }); + + sp_flush_commit.waitAndPause(); + + auto th_gc = std::async([&]() { + auto gc_n = store->onSyncGc(1); + ASSERT_EQ(gc_n, 1); + ASSERT_EQ(store->segments.size(), 1); + }); + + // Expect merge triggered by GC is retrying... because there is a flush in progress. + sp_merge_flush_retry.waitAndPause(); + + // Finish the flush. + sp_flush_commit.next(); + sp_flush_commit.disable(); + th_flush.wait(); + + // The merge in GC should continue without any further retries. + sp_merge_flush_retry.next(); + th_gc.wait(); +} +CATCH + + } // namespace tests } // namespace DM } // namespace DB From 69cd880d6de169d0c43a5706e45db10d642ad4a9 Mon Sep 17 00:00:00 2001 From: Wish Date: Thu, 15 Sep 2022 10:03:31 +0800 Subject: [PATCH 16/17] Verify the snapshot and the segment as usual Signed-off-by: Wish --- .../DeltaMerge/DeltaMergeStore_InternalBg.cpp | 29 ++++++++++++------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalBg.cpp b/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalBg.cpp index 389443381c1..89e5072a188 100644 --- a/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalBg.cpp +++ b/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalBg.cpp @@ -476,18 +476,25 @@ SegmentPtr DeltaMergeStore::gcTrySegmentMergeDelta(const DMContextPtr & dm_conte { SegmentSnapshotPtr segment_snap; { - std::shared_lock lock(read_write_mutex); // TODO: Do we really need this lock? - segment_snap = segment->createSnapshot(*dm_context, /* for_update */ true, CurrentMetrics::DT_SnapshotOfDeltaMerge); - } + std::shared_lock lock(read_write_mutex); - if (segment->hasAbandoned() || !segment_snap) - { - LOG_FMT_TRACE( - log, - "GC - MergeDelta skipped because snapshot failed, segment={} table={}", - segment->simpleInfo(), - table_name); - return {}; + // The segment we just retrieved may be dropped from the map. Let's verify it again before creating a snapshot. + if (!isSegmentValid(lock, segment)) + { + LOG_FMT_TRACE(log, "GC - Skip checking MergeDelta because not valid, segment={} table={}", segment->simpleInfo(), table_name); + return {}; + } + + segment_snap = segment->createSnapshot(*dm_context, /* for_update */ true, CurrentMetrics::DT_SnapshotOfDeltaMerge); + if (!segment_snap) + { + LOG_FMT_TRACE( + log, + "GC - Skip checking MergeDelta because snapshot failed, segment={} table={}", + segment->simpleInfo(), + table_name); + return {}; + } } RowKeyRange segment_range = segment->getRowKeyRange(); From 436a61c0d5a4e02462b9f4b66928d5bf5db004ea Mon Sep 17 00:00:00 2001 From: Wish Date: Thu, 15 Sep 2022 12:08:01 +0800 Subject: [PATCH 17/17] Address comments Signed-off-by: Wish --- dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalBg.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalBg.cpp b/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalBg.cpp index 89e5072a188..b40cc387613 100644 --- a/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalBg.cpp +++ b/dbms/src/Storages/DeltaMerge/DeltaMergeStore_InternalBg.cpp @@ -458,7 +458,7 @@ SegmentPtr DeltaMergeStore::gcTrySegmentMerge(const DMContextPtr & dm_context, c return {}; } - LOG_FMT_DEBUG( + LOG_FMT_INFO( log, "GC - Trigger Merge, segment={} table={}", segment->simpleInfo(),