From 72e66606640f3877a01450a0bcf0c86b5e22c499 Mon Sep 17 00:00:00 2001 From: Yugabyte CI Date: Tue, 6 Aug 2024 18:21:04 -0700 Subject: [PATCH] [BACKPORT pg15-cherrypicks] all: Bulk port from master - 71 Summary: 50931bf872 [#23273] yugabyted: Fix `yugabyted configure_read_replica` commands. 64e1bf8eef [#23278] CDCSDK: Handle non-eligible tables cleanup with drop table while loading CDC stream ce80f7aa9a [#13358] YSQL: Refactor DDL Atomicity Stress Test Excluded: 6d40d27561 [#23407] YSQL: clean up compound BNL logic 5cb74a7cfe [PLAT-14164] New Alert for clock drift f39c76cb99 [PLAT-14800] Fix yb.allow_db_version_more_than_yba_version being insufficient for YBA/DB version checks a42549e3bd [#23377] DocDB: Implement the way to apply vector index updates to DocDB 3923ec56b5 [PLAT-14749][Platform]Add a warning message to image upgrade dialog 709cd92d01 [PLAT-14848] postgres.service file did not have RestartSec filled out da10672cf1 [#23069] docdb: implemented per-iterator readahead for sequential reads f439c8aca6 [PLAT-14852]: Do not raise error when JWT_JWKS_URL has valid value and JWT has empty keyset Test Plan: Jenkins: rebase: pg15-cherrypicks Reviewers: jason, tfoucher Tags: #jenkins-ready Differential Revision: https://phorge.dev.yugabyte.com/D37095 --- bin/yugabyted | 13 +- .../com/yugabyte/yw/common/AlertTemplate.java | 3 +- .../com/yugabyte/yw/common/ReleasesUtils.java | 3 + .../main/resources/alert/alert_templates.yml | 25 + .../postgres/V366__Alert_Clock_Drift.sql | 21 + .../resources/health/node_health.py.template | 13 +- .../src/main/resources/swagger-strict.json | 6 +- managed/src/main/resources/swagger.json | 6 +- .../yugabyte/yw/common/ReleasesUtilsTest.java | 2 + .../UpgradeLinuxVersionModal.tsx | 23 +- managed/ui/src/translations/en.json | 3 +- managed/ui/src/utils/UniverseUtils.js | 33 +- .../templates/yba-installer-postgres.yml | 2 +- src/yb/common/doc_hybrid_time.h | 4 + src/yb/docdb/CMakeLists.txt | 2 + src/yb/docdb/docdb_debug.cc | 11 +- src/yb/docdb/docdb_rocksdb_util.cc | 8 + src/yb/docdb/vector_index.h | 24 + src/yb/docdb/vector_index_docdb-test.cc | 69 +++ src/yb/docdb/vector_index_update.cc | 116 ++++ src/yb/docdb/vector_index_update.h | 62 +++ src/yb/dockv/key_entry_value.h | 1 + src/yb/dockv/primitive_value.cc | 152 ++++-- src/yb/dockv/primitive_value.h | 26 + src/yb/dockv/value_type.h | 3 + src/yb/integration-tests/cdcsdk_ysql-test.cc | 56 ++ .../integration-tests/cdcsdk_ysql_test_base.h | 2 + .../integration-tests/tablet-split-itest.cc | 5 +- src/yb/master/xrepl_catalog_manager.cc | 51 +- src/yb/rocksdb/CMakeLists.txt | 1 + src/yb/rocksdb/db/readahead_test.cc | 511 ++++++++++++++++++ src/yb/rocksdb/statistics.h | 4 + .../rocksdb/table/block_based_table_reader.cc | 425 ++++++++++----- .../rocksdb/table/block_based_table_reader.h | 34 +- src/yb/rocksdb/table/index_reader.cc | 12 +- src/yb/rocksdb/table/index_reader.h | 14 +- src/yb/rocksdb/util/file_reader_writer.h | 2 + src/yb/rocksdb/util/statistics.cc | 5 + src/yb/util/endian_util.h | 52 ++ src/yb/util/file_system.h | 2 + src/yb/util/file_system_posix.cc | 11 + src/yb/util/file_system_posix.h | 6 +- src/yb/util/range.h | 21 + src/yb/util/slice.cc | 4 +- src/yb/util/slice.h | 18 +- .../pgwrapper/pg_ddl_atomicity_stress-test.cc | 365 ++++++------- 46 files changed, 1766 insertions(+), 466 deletions(-) create mode 100644 managed/src/main/resources/db/migration/default_/postgres/V366__Alert_Clock_Drift.sql create mode 100644 src/yb/docdb/vector_index.h create mode 100644 src/yb/docdb/vector_index_docdb-test.cc 
create mode 100644 src/yb/docdb/vector_index_update.cc create mode 100644 src/yb/docdb/vector_index_update.h create mode 100644 src/yb/rocksdb/db/readahead_test.cc create mode 100644 src/yb/util/endian_util.h diff --git a/bin/yugabyted b/bin/yugabyted index 670ebc64e338..54bf745ada89 100755 --- a/bin/yugabyted +++ b/bin/yugabyted @@ -6864,6 +6864,8 @@ class ControlScript(object): data_constraints = args.data_placement_constraint.split(",") has_errors = False for constraint in data_constraints: + if ':' not in constraint: + has_errors = True cloud_info = constraint.split(":")[0].split(".") if (len(cloud_info) == 3): for value in cloud_info: @@ -6875,8 +6877,8 @@ class ControlScript(object): if has_errors: Output.print_out(Output.make_red("ERROR") + ": Incorrect value specified for --data_placement_constraint. " + - "Please specify comma sperated value with format " + - "- cloudprovider.region.zone") + "Please specify comma separated values with the number of replicas with " + + "format - :") else: self.configs.temp_data[ "rr_data_placement_constraint"] = args.data_placement_constraint @@ -6900,6 +6902,8 @@ class ControlScript(object): data_constraints = args.data_placement_constraint.split(",") has_errors = False for constraint in data_constraints: + if ':' not in constraint: + has_errors = True cloud_info = constraint.split(".") if (len(cloud_info) == 3): for value in cloud_info: @@ -6911,9 +6915,8 @@ class ControlScript(object): if has_errors: Output.print_out( "Incorrect value specified for --data_placement_constraint. " + - "Please specify comma sperated value with format " + - "- cloudprovider.region.zone".format( - args.data_placement_constraint)) + "Please specify comma separated values with the number of replicas with " + + "format - :") else: self.configs.temp_data[ "rr_data_placement_constraint"] = args.data_placement_constraint diff --git a/managed/src/main/java/com/yugabyte/yw/common/AlertTemplate.java b/managed/src/main/java/com/yugabyte/yw/common/AlertTemplate.java index 76de95942d99..4e84b76593d2 100644 --- a/managed/src/main/java/com/yugabyte/yw/common/AlertTemplate.java +++ b/managed/src/main/java/com/yugabyte/yw/common/AlertTemplate.java @@ -81,5 +81,6 @@ public enum AlertTemplate { UNIVERSE_RELEASE_FILES_STATUS, HA_VERSION_MISMATCH, TABLET_PEERS_GUARDRAIL, - XCLUSTER_CONFIG_TABLE_BAD_STATE; + XCLUSTER_CONFIG_TABLE_BAD_STATE, + NODE_CLOCK_DRIFT; } diff --git a/managed/src/main/java/com/yugabyte/yw/common/ReleasesUtils.java b/managed/src/main/java/com/yugabyte/yw/common/ReleasesUtils.java index 911d7c6b6b94..2f749586e9d7 100644 --- a/managed/src/main/java/com/yugabyte/yw/common/ReleasesUtils.java +++ b/managed/src/main/java/com/yugabyte/yw/common/ReleasesUtils.java @@ -442,6 +442,9 @@ public ReleaseMetadata.HttpLocation httpLocationFromUrl(ReleaseArtifact artifact } public void validateVersionAgainstCurrentYBA(String version) { + if (confGetter.getGlobalConf(GlobalConfKeys.allowDbVersionMoreThanYbaVersion)) { + return; + } if (confGetter.getGlobalConf(GlobalConfKeys.skipVersionChecks)) { return; } diff --git a/managed/src/main/resources/alert/alert_templates.yml b/managed/src/main/resources/alert/alert_templates.yml index 06f00f1084ff..c8b4b34a6e06 100644 --- a/managed/src/main/resources/alert/alert_templates.yml +++ b/managed/src/main/resources/alert/alert_templates.yml @@ -1901,3 +1901,28 @@ templates: Percentage of tablet peers is high for universe '{{ $labels.source_name }}'. Current value is {{ $value | printf "%.0f" }} %.
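The hunks that follow add a NODE_CLOCK_DRIFT alert template (WARNING at 200 ms, SEVERE at 400 ms) driven by the yb_node_clock_drift_check_ms metric, which node_health.py derives from `chronyc tracking` by converting the reported offset in seconds to milliseconds. As a minimal standalone sketch of that conversion and threshold logic, not part of the patch and with illustrative names only:

// Sketch only: mirrors the seconds-to-milliseconds conversion done by
// get_clock_drift_ms() and the default thresholds of the NODE_CLOCK_DRIFT
// template added below (WARNING 200 ms, SEVERE 400 ms).
#include <cstdio>

enum class DriftSeverity { kOk, kWarning, kSevere };

double DriftMillis(double offset_seconds) {
  return offset_seconds * 1000.0;  // chronyc reports the offset in seconds
}

DriftSeverity Classify(double drift_ms) {
  if (drift_ms > 400.0) return DriftSeverity::kSevere;
  if (drift_ms > 200.0) return DriftSeverity::kWarning;
  return DriftSeverity::kOk;
}

int main() {
  std::printf("0.25 s offset -> %.0f ms drift\n", DriftMillis(0.25));  // 250 ms falls in the WARNING range
  return 0;
}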
Affected nodes: {{ $labels.affected_node_names }} + + NODE_CLOCK_DRIFT: + name: High clock drift + description: Local clock on the node has drift too far from the actual time in the past 10 minutes. + queryTemplate: max by (universe_uuid) (yb_node_clock_drift_check_ms{universe_uuid="__universeUuid__"}) {{ query_condition }} {{ query_threshold }} + createForNewCustomer: true + defaultThresholdMap: + WARNING: + threshold: 200.0 + SEVERE: + threshold: 400.0 + targetType: UNIVERSE + defaultThresholdCondition: GREATER_THAN + defaultThresholdUnit: MILLISECOND + thresholdUnitName: ms + labels: + affected_node_names: >- + {{ range $index, $element := query "max by (universe_uuid, node_name) + (yb_node_clock_drift_check_ms{universe_uuid='{{ $labels.universe_uuid }}'}) + / 1000 {{ query_condition }} {{ query_threshold }}" }}{{if $index}},{{end}}{{ $element.Labels.node_name }}{{ end }} + annotations: + summary: >- + Clock drift is high for universe '{{ $labels.source_name }}'. + Current drift from actual clock is {{ $value }} milliseconds. + Affected nodes: {{ $labels.affected_node_names }} \ No newline at end of file diff --git a/managed/src/main/resources/db/migration/default_/postgres/V366__Alert_Clock_Drift.sql b/managed/src/main/resources/db/migration/default_/postgres/V366__Alert_Clock_Drift.sql new file mode 100644 index 000000000000..b1cb5c7c4ffb --- /dev/null +++ b/managed/src/main/resources/db/migration/default_/postgres/V366__Alert_Clock_Drift.sql @@ -0,0 +1,21 @@ +-- Copyright (c) YugaByte, Inc. + + -- Clock skew alert + insert into alert_configuration + (uuid, customer_uuid, name, description, create_time, target_type, target, thresholds, threshold_unit, template, active, default_destination) +select + gen_random_uuid(), + uuid, + 'High clock drift', + 'Local clock on the node has drift too far from the actual time.', + current_timestamp, + 'UNIVERSE', + '{"all":true}', + '{"WARNING":{"condition":"GREATER_THAN","threshold":200.0},"SEVERE":{"condition":"GREATER_THAN", "threshold":400.0}}', + 'MILLISECOND', + 'NODE_CLOCK_DRIFT', + true, + true +from customer; + +select create_universe_alert_definitions('High clock drift'); \ No newline at end of file diff --git a/managed/src/main/resources/health/node_health.py.template b/managed/src/main/resources/health/node_health.py.template index 8f30ff6f0c26..23a51177a814 100755 --- a/managed/src/main/resources/health/node_health.py.template +++ b/managed/src/main/resources/health/node_health.py.template @@ -221,9 +221,9 @@ YB_NODE_CONTROLLER_CHECK = MetricDefinition( "yb_node_controller_check", "YB-Controller server check") YB_NODE_CLOCK_DRIFT_CHECK = MetricDefinition( - "yb_node_clock_drift_check", - "Time Drift betwen nodes within limits" -) + "yb_node_clock_drift_check_ms", + "Time Drift betwen nodes within limits", + "millisec") YB_DDL_ATOMICITY_CHECK = MetricDefinition( "yb_ddl_atomicity_check", "Status of DDL atomicity check" @@ -1914,7 +1914,7 @@ def get_clock_drift_ms(): out = check_output("chronyc tracking", env) match = re.search("System time.*: (.*) second", out, re.MULTILINE) if match: - return int(float(match.group(1))*1000) # Convert seconds to milliseconds + return float(match.group(1))*1000 # Convert seconds to milliseconds return "Error: Unknown time service" def chrony_exists(): @@ -2109,8 +2109,9 @@ def main(): coordinator.add_check(checker, "check_file_descriptors") if n.check_clock: coordinator.add_check(checker, "check_clock_skew") - if n.check_time_drift: - coordinator.add_check(checker, "check_yb_node_clock_drift") + + if 
n.check_time_drift: + coordinator.add_check(checker, "check_yb_node_clock_drift") if n.master_index >= 0: coordinator.add_check(checker, "check_uptime_for_process", MASTER) diff --git a/managed/src/main/resources/swagger-strict.json b/managed/src/main/resources/swagger-strict.json index 0692ad026f5a..c9e305c2f644 100644 --- a/managed/src/main/resources/swagger-strict.json +++ b/managed/src/main/resources/swagger-strict.json @@ -673,7 +673,7 @@ }, "template" : { "description" : "Template name", - "enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "DDL_ATOMICITY_CHECK", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "NODE_AGENT_DOWN", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "TABLET_PEERS_GUARDRAIL", "XCLUSTER_CONFIG_TABLE_BAD_STATE" ], + "enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "DDL_ATOMICITY_CHECK", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", 
"HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "NODE_AGENT_DOWN", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "TABLET_PEERS_GUARDRAIL", "XCLUSTER_CONFIG_TABLE_BAD_STATE", "NODE_CLOCK_DRIFT" ], "type" : "string" }, "thresholdUnit" : { @@ -726,7 +726,7 @@ "type" : "string" }, "template" : { - "enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "DDL_ATOMICITY_CHECK", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "NODE_AGENT_DOWN", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "TABLET_PEERS_GUARDRAIL", "XCLUSTER_CONFIG_TABLE_BAD_STATE" ], + "enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "DDL_ATOMICITY_CHECK", "NODE_TO_NODE_CA_CERT_EXPIRY", 
"NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "NODE_AGENT_DOWN", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "TABLET_PEERS_GUARDRAIL", "XCLUSTER_CONFIG_TABLE_BAD_STATE", "NODE_CLOCK_DRIFT" ], "type" : "string" }, "uuids" : { @@ -887,7 +887,7 @@ }, "template" : { "description" : "Template name", - "enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "DDL_ATOMICITY_CHECK", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "NODE_AGENT_DOWN", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "TABLET_PEERS_GUARDRAIL", "XCLUSTER_CONFIG_TABLE_BAD_STATE" ], + "enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", 
"DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "DDL_ATOMICITY_CHECK", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "NODE_AGENT_DOWN", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "TABLET_PEERS_GUARDRAIL", "XCLUSTER_CONFIG_TABLE_BAD_STATE", "NODE_CLOCK_DRIFT" ], "type" : "string" }, "thresholdConditionReadOnly" : { diff --git a/managed/src/main/resources/swagger.json b/managed/src/main/resources/swagger.json index c98d19e3d1dc..a16cabc32a0e 100644 --- a/managed/src/main/resources/swagger.json +++ b/managed/src/main/resources/swagger.json @@ -685,7 +685,7 @@ }, "template" : { "description" : "Template name", - "enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "DDL_ATOMICITY_CHECK", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "NODE_AGENT_DOWN", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "TABLET_PEERS_GUARDRAIL", "XCLUSTER_CONFIG_TABLE_BAD_STATE" ], + "enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", 
"MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "DDL_ATOMICITY_CHECK", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "NODE_AGENT_DOWN", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "TABLET_PEERS_GUARDRAIL", "XCLUSTER_CONFIG_TABLE_BAD_STATE", "NODE_CLOCK_DRIFT" ], "type" : "string" }, "thresholdUnit" : { @@ -738,7 +738,7 @@ "type" : "string" }, "template" : { - "enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "DDL_ATOMICITY_CHECK", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", 
"RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "NODE_AGENT_DOWN", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "TABLET_PEERS_GUARDRAIL", "XCLUSTER_CONFIG_TABLE_BAD_STATE" ], + "enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "DDL_ATOMICITY_CHECK", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "NODE_AGENT_DOWN", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "TABLET_PEERS_GUARDRAIL", "XCLUSTER_CONFIG_TABLE_BAD_STATE", "NODE_CLOCK_DRIFT" ], "type" : "string" }, "uuids" : { @@ -899,7 +899,7 @@ }, "template" : { "description" : "Template name", - "enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "DDL_ATOMICITY_CHECK", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", 
"LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "NODE_AGENT_DOWN", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "TABLET_PEERS_GUARDRAIL", "XCLUSTER_CONFIG_TABLE_BAD_STATE" ], + "enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "DDL_ATOMICITY_CHECK", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "NODE_AGENT_DOWN", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "TABLET_PEERS_GUARDRAIL", "XCLUSTER_CONFIG_TABLE_BAD_STATE", "NODE_CLOCK_DRIFT" ], "type" : "string" }, "thresholdConditionReadOnly" : { diff --git a/managed/src/test/java/com/yugabyte/yw/common/ReleasesUtilsTest.java b/managed/src/test/java/com/yugabyte/yw/common/ReleasesUtilsTest.java index 1d2cb14bdb2e..988a679dd842 100644 --- a/managed/src/test/java/com/yugabyte/yw/common/ReleasesUtilsTest.java +++ b/managed/src/test/java/com/yugabyte/yw/common/ReleasesUtilsTest.java @@ -81,6 +81,8 @@ public void testValidateVersionAgainstCurrentYBA() { when(configHelper.getConfig(ConfigHelper.ConfigType.SoftwareVersion)) .thenReturn(getVersionMap("2024.1.0.0-b23")); when(confGetter.getGlobalConf(GlobalConfKeys.skipVersionChecks)).thenReturn(false); + when(confGetter.getGlobalConf(GlobalConfKeys.allowDbVersionMoreThanYbaVersion)) + .thenReturn(false); // Should pass releasesUtils.validateVersionAgainstCurrentYBA("2024.1.0.0-b23"); diff --git a/managed/ui/src/components/configRedesign/providerRedesign/components/linuxVersionCatalog/UpgradeLinuxVersionModal.tsx b/managed/ui/src/components/configRedesign/providerRedesign/components/linuxVersionCatalog/UpgradeLinuxVersionModal.tsx index 5a9a2a620394..afe811c7e340 100644 --- 
a/managed/ui/src/components/configRedesign/providerRedesign/components/linuxVersionCatalog/UpgradeLinuxVersionModal.tsx +++ b/managed/ui/src/components/configRedesign/providerRedesign/components/linuxVersionCatalog/UpgradeLinuxVersionModal.tsx @@ -17,7 +17,7 @@ import { find, isEmpty } from 'lodash'; import * as yup from 'yup'; import { useMutation } from 'react-query'; import { Grid, MenuItem, Typography, makeStyles } from '@material-ui/core'; -import { YBInputField, YBModal, YBSelectField } from '../../../../../redesign/components'; +import { AlertVariant, YBAlert, YBInputField, YBModal, YBSelectField } from '../../../../../redesign/components'; import { createErrorMessage } from '../../../../../redesign/features/universe/universe-form/utils/helpers'; import { ClusterType, Universe } from '../../../../../redesign/helpers/dtos'; import { @@ -53,8 +53,11 @@ const useStyles = makeStyles((theme) => ({ targetVersion: { marginTop: '28px' }, + alert: { + marginTop: '50px' + }, info: { - marginTop: '60px', + marginTop: '16px', padding: '15px', borderRadius: '4px', border: `1px solid ${theme.palette.ybacolors.ybBorderGray}`, @@ -170,7 +173,7 @@ export const UpgradeLinuxVersionModal: FC = ({ className: classes.content }} overrideWidth={'886px'} - overrideHeight={'686px'} + overrideHeight={'720px'} cancelLabel={t('cancel', { keyPrefix: 'common' })} submitLabel={t('submitLabel')} buttonProps={{ @@ -254,7 +257,19 @@ export const UpgradeLinuxVersionModal: FC = ({ - + + }} + /> + } + variant={AlertVariant.Warning} + className={classes.alert} + />
here for task details", - "sameVersionErrMsg": "Universe is already using the imageBundle \"{{image_name}}\". Performing OS patching using it will result in no operation." + "sameVersionErrMsg": "Universe is already using the imageBundle \"{{image_name}}\". Performing OS patching using it will result in no operation.", + "verifyImageText": "It is recommended to verify that the selected image can boot successfully on a standalone VM with the same instance type as the universe before initiating the upgrade." }, "deleteModal": { "title": "Delete Linux Version", diff --git a/managed/ui/src/utils/UniverseUtils.js b/managed/ui/src/utils/UniverseUtils.js index a27cd5d3697e..9c2f864552d3 100644 --- a/managed/ui/src/utils/UniverseUtils.js +++ b/managed/ui/src/utils/UniverseUtils.js @@ -47,7 +47,8 @@ export const CONST_VALUES = { SINGLE_QUOTES_SEPARATOR: "'", COMMA_SEPARATOR: ',', EQUALS: '=', - JWKS: 'jwks' + JWKS: 'jwks', + JWT_JWKS_URL: 'jwt_jwks_url' }; export const GFLAG_EDIT = 'EDIT'; @@ -324,7 +325,10 @@ export const unformatConf = (GFlagInput) => { } // Extract jwks content from the row input if it exists - if (GFlagRowConfSubset.includes(CONST_VALUES.JWKS)) { + if ( + GFlagRowConfSubset.includes(CONST_VALUES.JWKS) && + !GFlagRowConfSubset.includes(CONST_VALUES.JWT_JWKS_URL) + ) { const JWKSKey = GFlagRowConfSubset.substring(GFlagRowConfSubset.indexOf(CONST_VALUES.JWKS)); if (isNonEmptyString(JWKSKey)) { GFlagRowConfSubset = GFlagRowConfSubset.replace(JWKSKey, ''); @@ -405,6 +409,7 @@ export const formatConf = (GFlagInput, searchTerm, JWKSToken) => { return initialLDAPConf + appendedLDAPConf + JWKS; } + return GFlagInput; }; @@ -433,16 +438,6 @@ export const verifyAttributes = (GFlagInput, searchTerm, JWKSKeyset, isOIDCSuppo return { isAttributeInvalid, errorMessageKey, isWarning }; } - // Raise error when there is jwt keyword but is no JWKS keyset associated with it - if (searchTerm === CONST_VALUES.JWT && (isEmptyString(JWKSKeyset) || !JWKSKeyset)) { - isAttributeInvalid = true; - isWarning = false; - errorMessageKey = isOIDCSupported - ? 'universeForm.gFlags.uploadKeyset' - : 'universeForm.gFlags.jwksNotSupported'; - return { isAttributeInvalid, errorMessageKey, isWarning }; - } - const keywordLength = searchTerm.length; const isKeywordExist = GFlagInput.includes(searchTerm); @@ -453,6 +448,20 @@ export const verifyAttributes = (GFlagInput, searchTerm, JWKSKeyset, isOIDCSuppo const keywordIndex = GFlagInput.indexOf(keywordList?.[0]); const keywordConf = GFlagInput?.substring(keywordIndex + 1 + keywordLength, GFlagInput.length); const attributes = keywordConf?.match(/(?:[^\s"|""]+|""[^"""]*"|")+/g); + const isJWTUrlExist = attributes?.some((input) => input.includes(CONST_VALUES.JWT_JWKS_URL)); + const isJWKSKesysetEmpty = isEmptyString(JWKSKeyset) || !JWKSKeyset; + + /* + Raise error when there is jwt keyword but is no JWT_JWKS_URL attribute present and Keyset is empty + */ + if (searchTerm === CONST_VALUES.JWT && !isJWTUrlExist && isJWKSKesysetEmpty) { + isAttributeInvalid = true; + isWarning = false; + errorMessageKey = isOIDCSupported + ? 
'universeForm.gFlags.uploadKeyset' + : 'universeForm.gFlags.jwksNotSupported'; + return { isAttributeInvalid, errorMessageKey, isWarning }; + } for (let index = 0; index < attributes?.length; index++) { const [attributeKey, ...attributeValues] = attributes[index]?.split(CONST_VALUES.EQUALS); diff --git a/managed/yba-installer/config/templates/yba-installer-postgres.yml b/managed/yba-installer/config/templates/yba-installer-postgres.yml index 25647a4a1b98..44a6ce60c446 100644 --- a/managed/yba-installer/config/templates/yba-installer-postgres.yml +++ b/managed/yba-installer/config/templates/yba-installer-postgres.yml @@ -22,7 +22,7 @@ -w -o "-k {{ .MountPath }}" -l {{ .LogFile }} stop Restart=always - RestartSec={{ yamlPath "postgres.restartSeconds"}} + RestartSec={{ yamlPath "postgres.install.restartSeconds"}} [Install] WantedBy=multi-user.target diff --git a/src/yb/common/doc_hybrid_time.h b/src/yb/common/doc_hybrid_time.h index 9e5b4201114a..0d06eaf4b939 100644 --- a/src/yb/common/doc_hybrid_time.h +++ b/src/yb/common/doc_hybrid_time.h @@ -129,6 +129,10 @@ class DocHybridTime { HybridTime hybrid_time() const { return hybrid_time_; } IntraTxnWriteId write_id() const { return write_id_; } + void IncrementWriteId() { + ++write_id_; + } + // Returns pointer to byte after last used byte. char* EncodedInDocDbFormat(char* dest) const; diff --git a/src/yb/docdb/CMakeLists.txt b/src/yb/docdb/CMakeLists.txt index 7c9b96f4f3e0..b2746bdaa0cf 100644 --- a/src/yb/docdb/CMakeLists.txt +++ b/src/yb/docdb/CMakeLists.txt @@ -78,6 +78,7 @@ set(DOCDB_SRCS transaction_dump.cc transaction_status_cache.cc local_waiting_txn_registry.cc + vector_index_update.cc wait_queue.cc ) @@ -143,6 +144,7 @@ ADD_YB_TEST(shared_lock_manager-test) ADD_YB_TEST(consensus_frontier-test) ADD_YB_TEST(compaction_file_filter-test) ADD_YB_TEST(usearch_vector_index-test) +ADD_YB_TEST(vector_index_docdb-test) if(YB_BUILD_FUZZ_TARGETS) # A library with common code shared between DocDB fuzz tests. diff --git a/src/yb/docdb/docdb_debug.cc b/src/yb/docdb/docdb_debug.cc index 1ff0ffbf850c..4a5551449b9f 100644 --- a/src/yb/docdb/docdb_debug.cc +++ b/src/yb/docdb/docdb_debug.cc @@ -53,15 +53,8 @@ template void ProcessDumpEntry( Slice key, Slice value, SchemaPackingProvider* schema_packing_provider /*null ok*/, StorageDbType db_type, IncludeBinary include_binary, DumpStringFunc func) { - auto [key_str, value_str] = DumpEntryToString(key, value, schema_packing_provider, db_type); - if (!key_str.ok()) { - func(key_str.status().ToString()); - } - if (!value_str.ok()) { - func(value_str.status().CloneAndAppend(". Key: " + *key_str).ToString()); - } else { - func(Format("$0 -> $1", *key_str, *value_str)); - } + auto [key_res, value_res] = DumpEntryToString(key, value, schema_packing_provider, db_type); + func(Format("$0 -> $1", key_res, value_res)); if (include_binary) { func(Format("$0 -> $1\n", FormatSliceAsStr(key), FormatSliceAsStr(value))); } diff --git a/src/yb/docdb/docdb_rocksdb_util.cc b/src/yb/docdb/docdb_rocksdb_util.cc index 9020a009cdf1..208443668f31 100644 --- a/src/yb/docdb/docdb_rocksdb_util.cc +++ b/src/yb/docdb/docdb_rocksdb_util.cc @@ -112,6 +112,12 @@ DEFINE_UNKNOWN_uint64(rocksdb_max_file_size_for_compaction, 0, // db_max_flushing_bytes will be actual default. 
DEFINE_NON_RUNTIME_int32(rocksdb_max_write_buffer_number, 100500, "Maximum number of write buffers that are built up in memory."); + +DEFINE_RUNTIME_bool( + rocksdb_advise_random_on_open, true, + "If set to true, will hint the underlying file system that the file access pattern is random, " + "when a sst file is opened."); + DECLARE_int64(db_block_size_bytes); DEFINE_UNKNOWN_int64(db_filter_block_size_bytes, 64_KB, @@ -739,6 +745,8 @@ void InitRocksDBOptions( options->max_flushing_bytes = FLAGS_db_max_flushing_bytes; } + options->advise_random_on_open = FLAGS_rocksdb_advise_random_on_open; + options->memtable_factory = std::make_shared( 0 /* lookahead */, rocksdb::ConcurrentWrites::kFalse); diff --git a/src/yb/docdb/vector_index.h b/src/yb/docdb/vector_index.h new file mode 100644 index 000000000000..d826ffe8b026 --- /dev/null +++ b/src/yb/docdb/vector_index.h @@ -0,0 +1,24 @@ +// Copyright (c) YugabyteDB, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. See the License for the specific language governing permissions and limitations +// under the License. +// + +#pragma once + +#include "yb/dockv/primitive_value.h" + +namespace yb::docdb { + +using VertexId = uint64_t; +using VectorIndexLevel = uint8_t; +using VectorNodeNeighbors = std::set; + +} // namespace yb::docdb diff --git a/src/yb/docdb/vector_index_docdb-test.cc b/src/yb/docdb/vector_index_docdb-test.cc new file mode 100644 index 000000000000..adb7a4719dad --- /dev/null +++ b/src/yb/docdb/vector_index_docdb-test.cc @@ -0,0 +1,69 @@ +// Copyright (c) YugabyteDB, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. See the License for the specific language governing permissions and limitations +// under the License. +// + +#include "yb/docdb/docdb_test_base.h" +#include "yb/docdb/vector_index_update.h" + +#include "yb/util/range.h" + +namespace yb::docdb { + +class VectorIndexDocDBTest : public DocDBTestBase { + Schema CreateSchema() override { + return Schema(); + } +}; + +TEST_F(VectorIndexDocDBTest, Update) { + const HybridTime hybrid_time = HybridTime::FromMicros(1000); + constexpr int kNumNodes = 3; + const auto kNodes = Range(1, kNumNodes + 1); + rocksdb::WriteBatch write_batch; + FloatVectorIndexUpdate update(hybrid_time, write_batch); + for (int i : kNodes) { + update.AddVector(i, {static_cast(M_E * i), static_cast(M_PI * i)}); + } + for (int i : kNodes) { + update.SetNeighbors(i, /* level= */ 0, Range(i + 1, kNumNodes + 1).ToContainer()); + } + for (int i : kNodes) { + update.AddDirectedEdge(i, (i % kNumNodes) + 1, i * 10); + } + + update.DeleteDirectedEdge(2, 3, 20); + update.DeleteVector(3); + + ASSERT_OK(rocksdb()->Write(write_options(), &write_batch)); + + AssertDocDbDebugDumpStrEq(R"#( + // The vector 1 itself. 
+ SubDocKey(DocKey([], [1]), [HT{ physical: 1000 }]) -> [2.71828174591064, 3.14159274101257] + // The neighbors of the vector 1 in level 0. + SubDocKey(DocKey([], [1]), [0; HT{ physical: 1000 w: 3 }]) -> [2, 3] + // The added edge from vector 1 to vector 2 in level 10. + SubDocKey(DocKey([], [1]), [10, 2; HT{ physical: 1000 w: 6 }]) -> null + // The same for remaining vectors. + SubDocKey(DocKey([], [2]), [HT{ physical: 1000 w: 1 }]) -> [5.43656349182129, 6.28318548202515] + SubDocKey(DocKey([], [2]), [0; HT{ physical: 1000 w: 4 }]) -> [3] + // Delete the edge from vector 2 to vector 3 in level 20. + SubDocKey(DocKey([], [2]), [20, 3; HT{ physical: 1000 w: 9 }]) -> DEL + SubDocKey(DocKey([], [2]), [20, 3; HT{ physical: 1000 w: 7 }]) -> null + // Delete the vector 3. + SubDocKey(DocKey([], [3]), [HT{ physical: 1000 w: 10 }]) -> DEL + SubDocKey(DocKey([], [3]), [HT{ physical: 1000 w: 2 }]) -> [8.15484523773193, 9.42477798461914] + SubDocKey(DocKey([], [3]), [0; HT{ physical: 1000 w: 5 }]) -> [] + SubDocKey(DocKey([], [3]), [30, 1; HT{ physical: 1000 w: 8 }]) -> null + )#"); +} + +} // namespace yb::docdb diff --git a/src/yb/docdb/vector_index_update.cc b/src/yb/docdb/vector_index_update.cc new file mode 100644 index 000000000000..397a96cd0891 --- /dev/null +++ b/src/yb/docdb/vector_index_update.cc @@ -0,0 +1,116 @@ +// Copyright (c) YugabyteDB, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. See the License for the specific language governing permissions and limitations +// under the License. 
+// + +#include "yb/docdb/vector_index_update.h" + +#include "yb/dockv/doc_key.h" + +#include "yb/util/decimal.h" + +namespace yb::docdb { + +template +void VectorIndexUpdate::AddVector(VertexId id, IndexedVector vector) { + write_batch_.Put(MakeKey(id).AsSlice(), dockv::PrimitiveValue::Encoded(vector).AsSlice()); + nodes_[id].vector = std::move(vector); +} + +template +void VectorIndexUpdate::DeleteVector(yb::docdb::VertexId id) { + write_batch_.Put(MakeKey(id).AsSlice(), dockv::PrimitiveValue::TombstoneSlice()); + nodes_[id].tombstone = true; +} + +template +void VectorIndexUpdate::SetNeighbors( + VertexId id, VectorIndexLevel level, VectorNodeNeighbors new_neighbors) { + write_batch_.Put( + MakeKey(id, level), + dockv::PrimitiveValue::Encoded( + dockv::UInt64Vector{new_neighbors.begin(), new_neighbors.end()}).AsSlice()); + + GetLevel(id, level).neighbors = std::move(new_neighbors); +} + +template +void VectorIndexUpdate::AddDirectedEdge( + VertexId a, VertexId b, VectorIndexLevel level) { + write_batch_.Put(MakeKey(a, level, b), dockv::PrimitiveValue::NullSlice()); + + auto& vector_info = GetLevel(a, level); + vector_info.neighbors.insert(b); + vector_info.deleted_neighbors.erase(b); +} + +template +void VectorIndexUpdate::DeleteDirectedEdge( + VertexId a, VertexId b, VectorIndexLevel level) { + write_batch_.Put(MakeKey(a, level, b), dockv::PrimitiveValue::TombstoneSlice()); + + auto& vector_info = GetLevel(a, level); + vector_info.neighbors.erase(b); + vector_info.deleted_neighbors.insert(b); +} + +template +auto VectorIndexUpdate::GetLevel(VertexId id, VectorIndexLevel level) -> + VectorIndexUpdate::IndexedVectorLevelInfo& { + auto& node = nodes_[id]; + if (level >= node.levels.size()) { + node.levels.resize(level + 1); + } + return node.levels[level]; +} + +namespace { + +void AppendSubkeys(dockv::KeyBytes& key) { +} + +void AppendSubkey(dockv::KeyBytes& key, VectorIndexLevel level) { + key.AppendKeyEntryType(dockv::KeyEntryType::kUInt32); + key.AppendUInt32(level); +} + +void AppendSubkey(dockv::KeyBytes& key, VertexId id) { + key.AppendKeyEntryType(dockv::KeyEntryType::kUInt64); + key.AppendUInt64(id); +} + +template +void AppendSubkeys(dockv::KeyBytes& key, const T& t, Subkeys&&... subkeys) { + AppendSubkey(key, t); + AppendSubkeys(key, std::forward(subkeys)...); +} + +} // namespace + +template +template +dockv::KeyBytes VectorIndexUpdate::MakeKey(VertexId id, Subkeys&&... subkeys) { + dockv::KeyBytes key; + auto key_entry_value = dockv::KeyEntryValue::VectorVertexId(id); + key_entry_value.AppendToKey(&key); + key.AppendGroupEnd(); + AppendSubkeys(key, std::forward(subkeys)...); + key.AppendKeyEntryType(dockv::KeyEntryType::kHybridTime); + key.AppendHybridTime(doc_ht_); + + doc_ht_.IncrementWriteId(); + + return key; +} + +template class VectorIndexUpdate; + +} // namespace yb::docdb diff --git a/src/yb/docdb/vector_index_update.h b/src/yb/docdb/vector_index_update.h new file mode 100644 index 000000000000..602058820aef --- /dev/null +++ b/src/yb/docdb/vector_index_update.h @@ -0,0 +1,62 @@ +// Copyright (c) YugabyteDB, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. 
See the License for the specific language governing permissions and limitations +// under the License. +// + +#pragma once + +#include +#include + +#include "yb/docdb/vector_index.h" + +#include "yb/rocksdb/write_batch.h" + +namespace yb::docdb { + +template +class VectorIndexUpdate { + public: + using IndexedVector = std::vector; + + explicit VectorIndexUpdate(HybridTime ht, rocksdb::WriteBatch& write_batch) + : doc_ht_(ht), write_batch_(write_batch) {} + + void AddVector(VertexId id, IndexedVector v); + void DeleteVector(VertexId id); + void SetNeighbors(VertexId id, VectorIndexLevel level, VectorNodeNeighbors new_neighbors); + void AddDirectedEdge(VertexId a, VertexId b, VectorIndexLevel level); + void DeleteDirectedEdge(VertexId a, VertexId b, VectorIndexLevel level); + + private: + struct IndexedVectorLevelInfo { + VectorNodeNeighbors neighbors; + VectorNodeNeighbors deleted_neighbors; + }; + + IndexedVectorLevelInfo& GetLevel(VertexId id, VectorIndexLevel level); + template + dockv::KeyBytes MakeKey(VertexId id, Subkeys&&... subkeys); + + struct IndexedVectorInfo { + bool tombstone = false; + IndexedVector vector; + std::vector levels; + }; + + DocHybridTime doc_ht_; + std::unordered_map nodes_; + rocksdb::WriteBatch& write_batch_; +}; + +using FloatVectorIndexUpdate = VectorIndexUpdate; + +} // namespace yb::docdb diff --git a/src/yb/dockv/key_entry_value.h b/src/yb/dockv/key_entry_value.h index b019130e0753..1ee4ab85f728 100644 --- a/src/yb/dockv/key_entry_value.h +++ b/src/yb/dockv/key_entry_value.h @@ -149,6 +149,7 @@ class KeyEntryValue { static KeyEntryValue UInt32(uint32_t v, SortOrder sort_order = SortOrder::kAscending); static KeyEntryValue Int64(int64_t v, SortOrder sort_order = SortOrder::kAscending); static KeyEntryValue UInt64(uint64_t v, SortOrder sort_order = SortOrder::kAscending); + static KeyEntryValue VectorVertexId(uint64_t v); static KeyEntryValue MakeTimestamp( const Timestamp& timestamp, SortOrder sort_order = SortOrder::kAscending); static KeyEntryValue MakeInetAddress( diff --git a/src/yb/dockv/primitive_value.cc b/src/yb/dockv/primitive_value.cc index afe5cfcb2854..65bdab2645be 100644 --- a/src/yb/dockv/primitive_value.cc +++ b/src/yb/dockv/primitive_value.cc @@ -38,6 +38,7 @@ #include "yb/util/bytes_formatter.h" #include "yb/util/compare_util.h" #include "yb/util/decimal.h" +#include "yb/util/endian_util.h" #include "yb/util/fast_varint.h" #include "yb/util/net/inetaddress.h" #include "yb/util/result.h" @@ -60,17 +61,19 @@ using yb::util::DecodeDoubleFromKey; // default clause so that we can ensure that we're handling all possible primitive value types // at compile time. 
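The comment above describes the exhaustive-switch idiom that the IGNORE_NON_PRIMITIVE_VALUE_TYPES_IN_SWITCH macro below relies on: by naming every enumerator and omitting a default clause, the compiler can flag any newly added value type that is not handled. A minimal sketch of the idiom, using a hypothetical enum rather than YugabyteDB's types:

// Illustrative only; the enum and function are hypothetical.
// With no default clause, -Wswitch flags any enumerator missing from the
// switch, so adding a new value forces this function to be revisited.
#include <string>

enum class ValueKind { kInt32, kString, kTombstone };

std::string ToString(ValueKind kind) {
  switch (kind) {
    case ValueKind::kInt32:
      return "int32";
    case ValueKind::kString:
      return "string";
    case ValueKind::kTombstone:
      return "tombstone";
  }
  return "unknown";  // unreachable for valid enumerators
}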
#define IGNORE_NON_PRIMITIVE_VALUE_TYPES_IN_SWITCH \ - case ValueEntryType::kArray: FALLTHROUGH_INTENDED; \ - case ValueEntryType::kInvalid: FALLTHROUGH_INTENDED; \ - case ValueEntryType::kJsonb: FALLTHROUGH_INTENDED; \ - case ValueEntryType::kObject: FALLTHROUGH_INTENDED; \ - case ValueEntryType::kPackedRowV1: FALLTHROUGH_INTENDED; \ - case ValueEntryType::kPackedRowV2: FALLTHROUGH_INTENDED; \ - case ValueEntryType::kRedisList: FALLTHROUGH_INTENDED; \ - case ValueEntryType::kRedisSet: FALLTHROUGH_INTENDED; \ - case ValueEntryType::kRedisSortedSet: FALLTHROUGH_INTENDED; \ - case ValueEntryType::kRedisTS: FALLTHROUGH_INTENDED; \ - case ValueEntryType::kRowLock: FALLTHROUGH_INTENDED; \ + case ValueEntryType::kArray: [[fallthrough]]; \ + case ValueEntryType::kInvalid: [[fallthrough]]; \ + case ValueEntryType::kJsonb: [[fallthrough]]; \ + case ValueEntryType::kObject: [[fallthrough]]; \ + case ValueEntryType::kPackedRowV1: [[fallthrough]]; \ + case ValueEntryType::kPackedRowV2: [[fallthrough]]; \ + case ValueEntryType::kRedisList: [[fallthrough]]; \ + case ValueEntryType::kRedisSet: [[fallthrough]]; \ + case ValueEntryType::kRedisSortedSet: [[fallthrough]]; \ + case ValueEntryType::kRedisTS: [[fallthrough]]; \ + case ValueEntryType::kRowLock: [[fallthrough]]; \ + case ValueEntryType::kFloatVector: [[fallthrough]]; \ + case ValueEntryType::kUInt64Vector: [[fallthrough]]; \ case ValueEntryType::kTombstone: \ break @@ -100,6 +103,8 @@ namespace yb::dockv { namespace { +using VectorEndian = LittleEndian; + bool IsTrue(ValueEntryType type) { return type == ValueEntryType::kTrue; } @@ -213,11 +218,21 @@ inline bool IsCollationEncodedString(Slice val) { return !val.empty() && val[0] == '\0'; } +template +Status CheckNumberOfBytes(size_t found, size_t expected, const Name& name) { + if (found == expected) { + return Status::OK(); + } + return STATUS_FORMAT(Corruption, "Invalid number of bytes to decode $0: $1, need $2", + name, found, expected); +} + } // anonymous namespace const PrimitiveValue PrimitiveValue::kInvalid = PrimitiveValue(ValueEntryType::kInvalid); const PrimitiveValue PrimitiveValue::kTombstone = PrimitiveValue(ValueEntryType::kTombstone); const PrimitiveValue PrimitiveValue::kObject = PrimitiveValue(ValueEntryType::kObject); +const PrimitiveValue PrimitiveValue::kNull = PrimitiveValue(ValueEntryType::kNullLow); const KeyEntryValue KeyEntryValue::kLivenessColumn = KeyEntryValue::SystemColumnId( SystemColumnIds::kLivenessColumn); @@ -311,6 +326,10 @@ std::string PrimitiveValue::ValueToString() const { return Substitute("SubTransactionId($0)", uint32_val_); case ValueEntryType::kWriteId: return Format("WriteId($0)", int32_val_); + case ValueEntryType::kFloatVector: + return AsString(*float_vector_); + case ValueEntryType::kUInt64Vector: + return AsString(*uint64_vector_); case ValueEntryType::kMaxByte: return "0xff"; } @@ -391,6 +410,7 @@ void KeyEntryValue::AppendToKey(KeyBytes* key_bytes) const { return; case KeyEntryType::kUInt64: + case KeyEntryType::kVertexId: key_bytes->AppendUInt64(uint64_val_); return; @@ -953,6 +973,7 @@ Status KeyEntryValue::DecodeKey(Slice* slice, KeyEntryValue* out) { case KeyEntryType::kUInt64Descending: FALLTHROUGH_INTENDED; case KeyEntryType::kUInt64: + case KeyEntryType::kVertexId: if (slice->size() < sizeof(uint64_t)) { return STATUS_SUBSTITUTE(Corruption, "Not enough bytes to decode a 64-bit integer: $0", @@ -1215,10 +1236,7 @@ Status PrimitiveValue::DecodeFromValue(const Slice& rocksdb_slice) { return Status::OK(); case ValueEntryType::kGinNull: - if 
(slice.size() != sizeof(uint8_t)) { - return STATUS_FORMAT(Corruption, "Invalid number of bytes for a $0: $1", - value_type, slice.size()); - } + RETURN_NOT_OK(CheckNumberOfBytes(slice.size(), sizeof(uint8_t), value_type)); type_ = value_type; gin_null_val_ = slice.data()[0]; return Status::OK(); @@ -1226,10 +1244,7 @@ Status PrimitiveValue::DecodeFromValue(const Slice& rocksdb_slice) { case ValueEntryType::kInt32: FALLTHROUGH_INTENDED; case ValueEntryType::kWriteId: FALLTHROUGH_INTENDED; case ValueEntryType::kFloat: - if (slice.size() != sizeof(int32_t)) { - return STATUS_FORMAT(Corruption, "Invalid number of bytes for a $0: $1", - value_type, slice.size()); - } + RETURN_NOT_OK(CheckNumberOfBytes(slice.size(), sizeof(int32_t), value_type)); type_ = value_type; int32_val_ = BigEndian::Load32(slice.data()); return Status::OK(); @@ -1337,6 +1352,16 @@ Status PrimitiveValue::DecodeFromValue(const Slice& rocksdb_slice) { return Status::OK(); } + case ValueEntryType::kFloatVector: + return DecodeVector(slice, value_type, float_vector_, [](auto*& input) { + return bit_cast(Read(input)); + }); + + case ValueEntryType::kUInt64Vector: + return DecodeVector(slice, value_type, uint64_vector_, [](auto*& input) { + return Read(input); + }); + case ValueEntryType::kInvalid: [[fallthrough]]; case ValueEntryType::kPackedRowV1: [[fallthrough]]; case ValueEntryType::kPackedRowV2: [[fallthrough]]; @@ -1347,6 +1372,22 @@ Status PrimitiveValue::DecodeFromValue(const Slice& rocksdb_slice) { false, Corruption, "Wrong value type $0 in $1", value_type, rocksdb_slice.ToDebugHexString()); } +template +Status PrimitiveValue::DecodeVector( + Slice slice, ValueEntryType value_type, Vector*& vector, const Reader& reader) { + size_t size = VERIFY_RESULT((CheckedRead(slice))); + RETURN_NOT_OK(CheckNumberOfBytes( + slice.size(), size * sizeof(typename Vector::value_type), value_type)); + type_ = value_type; + vector = new Vector(size); + + auto* input = slice.data(); + for (size_t i = 0; i != size; ++i) { + (*vector)[i] = reader(input); + } + return Status::OK(); +} + Status PrimitiveValue::DecodeToQLValuePB( const Slice& rocksdb_slice, DataType data_type, QLValuePB* ql_value) { @@ -1514,10 +1555,7 @@ Status PrimitiveValue::DecodeToQLValuePB( case ValueEntryType::kTransactionId: FALLTHROUGH_INTENDED; case ValueEntryType::kTableId: FALLTHROUGH_INTENDED; case ValueEntryType::kUuid: { - if (slice.size() != kUuidSize) { - return STATUS_FORMAT(Corruption, "Invalid number of bytes to decode Uuid: $0, need $1", - slice.size(), kUuidSize); - } + RETURN_NOT_OK(CheckNumberOfBytes(slice.size(), kUuidSize, "Uuid")); Uuid uuid = VERIFY_RESULT(Uuid::FromComparable(slice)); if (data_type == DataType::UUID) { QLValue::set_uuid_value(uuid, ql_value); @@ -1551,13 +1589,15 @@ Status PrimitiveValue::DecodeToQLValuePB( break; } - case ValueEntryType::kObject: FALLTHROUGH_INTENDED; - case ValueEntryType::kArray: FALLTHROUGH_INTENDED; - case ValueEntryType::kRedisList: FALLTHROUGH_INTENDED; - case ValueEntryType::kRedisSet: FALLTHROUGH_INTENDED; - case ValueEntryType::kRedisTS: FALLTHROUGH_INTENDED; - case ValueEntryType::kRedisSortedSet: FALLTHROUGH_INTENDED; - case ValueEntryType::kGinNull: + case ValueEntryType::kObject: [[fallthrough]]; + case ValueEntryType::kArray: [[fallthrough]]; + case ValueEntryType::kRedisList: [[fallthrough]]; + case ValueEntryType::kRedisSet: [[fallthrough]]; + case ValueEntryType::kRedisTS: [[fallthrough]]; + case ValueEntryType::kRedisSortedSet: [[fallthrough]]; + case ValueEntryType::kGinNull: [[fallthrough]]; + 
case ValueEntryType::kFloatVector: [[fallthrough]]; + case ValueEntryType::kUInt64Vector: break; case ValueEntryType::kInvalid: [[fallthrough]]; @@ -1862,6 +1902,10 @@ PrimitiveValue::~PrimitiveValue() { delete inetaddress_val_; } else if (type_ == ValueEntryType::kFrozen) { delete frozen_val_; + } else if (type_ == ValueEntryType::kFloatVector) { + delete float_vector_; + } else if (type_ == ValueEntryType::kUInt64Vector) { + delete uint64_vector_; } // HybridTime does not need its destructor to be called, because it is a simple wrapper over an // unsigned 64-bit integer. @@ -2453,6 +2497,13 @@ KeyEntryValue KeyEntryValue::GinNull(uint8_t v) { return result; } +KeyEntryValue KeyEntryValue::VectorVertexId(uint64_t v) { + KeyEntryValue result; + result.type_ = KeyEntryType::kVertexId; + result.uint64_val_ = v; + return result; +} + KeyEntryValue::~KeyEntryValue() { Destroy(); } @@ -2676,7 +2727,8 @@ std::string KeyEntryValue::ToString(AutoDecodeKeys auto_decode_keys) const { case KeyEntryType::kUInt32: case KeyEntryType::kUInt32Descending: return std::to_string(uint32_val_); - case KeyEntryType::kUInt64: FALLTHROUGH_INTENDED; + case KeyEntryType::kUInt64: [[fallthrough]]; + case KeyEntryType::kVertexId: [[fallthrough]]; case KeyEntryType::kUInt64Descending: return std::to_string(uint64_val_); case KeyEntryType::kInt64Descending: FALLTHROUGH_INTENDED; @@ -2791,6 +2843,7 @@ int KeyEntryValue::CompareTo(const KeyEntryValue& other) const { case KeyEntryType::kUInt64Descending: return CompareUsingLessThan(other.uint64_val_, uint64_val_); case KeyEntryType::kUInt64: + case KeyEntryType::kVertexId: return CompareUsingLessThan(uint64_val_, other.uint64_val_); case KeyEntryType::kInt64: FALLTHROUGH_INTENDED; case KeyEntryType::kArrayIndex: @@ -3011,6 +3064,7 @@ bool operator==(const KeyEntryValue& lhs, const KeyEntryValue& rhs) { case KeyEntryType::kUInt64Descending: FALLTHROUGH_INTENDED; case KeyEntryType::kUInt64: + case KeyEntryType::kVertexId: return lhs.uint64_val_ == rhs.uint64_val_; case KeyEntryType::kInt64Descending: FALLTHROUGH_INTENDED; @@ -3065,4 +3119,40 @@ bool operator==(const KeyEntryValue& lhs, const KeyEntryValue& rhs) { FATAL_INVALID_ENUM_VALUE(KeyEntryType, lhs.type_); } +template +void PrimitiveValue::AppendEncodedVector( + ValueEntryType value_type, const Vector& v, ValueBuffer& buffer, const Writer& writer) { + auto* out = buffer.GrowByAtLeast( + 1 + sizeof(uint32_t) + v.size() * sizeof(typename Vector::value_type)); + *(out++) = static_cast(value_type); + Write(out, narrow_cast(v.size())); + for (auto entry : v) { + writer(out, entry); + } +} + +void PrimitiveValue::AppendEncodedTo(const FloatVector& v, ValueBuffer& buffer) { + AppendEncodedVector( + dockv::ValueEntryType::kFloatVector, v, buffer, [](auto*& out, float entry) { + Write(out, bit_cast(util::CanonicalizeFloat(entry))); + }); +} + +void PrimitiveValue::AppendEncodedTo(const UInt64Vector& v, ValueBuffer& buffer) { + AppendEncodedVector( + dockv::ValueEntryType::kUInt64Vector, v, buffer, [](auto*& out, uint64_t entry) { + Write(out, entry); + }); +} + +Slice PrimitiveValue::NullSlice() { + static const char kBuffer = ValueEntryTypeAsChar::kNullLow; + return Slice(&kBuffer, 1); +} + +Slice PrimitiveValue::TombstoneSlice() { + static const char kBuffer = ValueEntryTypeAsChar::kTombstone; + return Slice(&kBuffer, 1); +} + } // namespace yb::dockv diff --git a/src/yb/dockv/primitive_value.h b/src/yb/dockv/primitive_value.h index d1a8006b54d3..a4a800e5fead 100644 --- a/src/yb/dockv/primitive_value.h +++ 
b/src/yb/dockv/primitive_value.h @@ -47,11 +47,15 @@ YB_DEFINE_ENUM(ListExtendOrder, (APPEND)(PREPEND_BLOCK)(PREPEND)) // A necessary use of a forward declaration to avoid circular inclusion. class SubDocument; +using FloatVector = std::vector; +using UInt64Vector = std::vector; + class PrimitiveValue { public: static const PrimitiveValue kInvalid; static const PrimitiveValue kTombstone; static const PrimitiveValue kObject; + static const PrimitiveValue kNull; using Type = ValueEntryType; @@ -241,6 +245,19 @@ class PrimitiveValue { write_time_ = write_time; } + static void AppendEncodedTo(const FloatVector& v, ValueBuffer& out); + static void AppendEncodedTo(const UInt64Vector& v, ValueBuffer& out); + + template + static ValueBuffer Encoded(const T& t) { + ValueBuffer value; + AppendEncodedTo(t, value); + return value; + } + + static Slice NullSlice(); + static Slice TombstoneSlice(); + protected: static constexpr int64_t kUninitializedWriteTime = std::numeric_limits::min(); @@ -276,12 +293,21 @@ class PrimitiveValue { // This is used in SubDocument to hold a pointer to a map or a vector. void* complex_data_structure_; uint8_t gin_null_val_; + FloatVector* float_vector_; + UInt64Vector* uint64_vector_; }; private: template static PrimitiveValue DoFromQLValuePB(const PB& value); + template + Status DecodeVector( + Slice slice, ValueEntryType value_type, Vector*& vector, const Reader& reader); + + template + static void AppendEncodedVector( + ValueEntryType value_type, const Vector& v, ValueBuffer& out, const Writer& writer); // This is used in both the move constructor and the move assignment operator. Assumes this object // has not been constructed, or that the destructor has just been called. diff --git a/src/yb/dockv/value_type.h b/src/yb/dockv/value_type.h index 27817d04ae2f..e100f8ab9c4c 100644 --- a/src/yb/dockv/value_type.h +++ b/src/yb/dockv/value_type.h @@ -95,6 +95,7 @@ namespace yb::dockv { ((kString, 'S')) /* ASCII code 83 */ \ ((kTrue, 'T')) /* ASCII code 84 */ \ ((kUInt64, 'U')) /* ASCII code 85 */ \ + ((kVertexId, 'V')) /* ASCII code 86 */ \ ((kExternalIntents, 'Z')) /* ASCII code 90 */ \ ((kArrayIndex, '[')) /* ASCII code 91 */ \ ((kCollString, '\\')) /* ASCII code 92 */ \ @@ -175,6 +176,8 @@ namespace yb::dockv { ((kString, 'S')) /* ASCII code 83 */ \ ((kTrue, 'T')) /* ASCII code 84 */ \ ((kUInt64, 'U')) /* ASCII code 85 */ \ + ((kFloatVector, 'V')) /* ASCII code 86 */ \ + ((kUInt64Vector, 'W')) /* ASCII code 87 */ \ ((kTombstone, 'X')) /* ASCII code 88 */ \ ((kArrayIndex, '[')) /* ASCII code 91 */ \ ((kCollString, '\\')) /* ASCII code 92 */ \ diff --git a/src/yb/integration-tests/cdcsdk_ysql-test.cc b/src/yb/integration-tests/cdcsdk_ysql-test.cc index 62ec0102c0e8..ea68c0239636 100644 --- a/src/yb/integration-tests/cdcsdk_ysql-test.cc +++ b/src/yb/integration-tests/cdcsdk_ysql-test.cc @@ -9289,5 +9289,61 @@ TEST_F(CDCSDKYsqlTest, TestChildTabletsOfNonEligibleTableDoNotGetAddedToConsiste /* use_consistent_snapshot_stream */ true); } +TEST_F( + CDCSDKYsqlTest, + YB_DISABLE_TEST_IN_TSAN(TestNonEligibleTablesCleanupWhenDropTableCleanupIsDisabled)) { + ANNOTATE_UNPROTECTED_WRITE(FLAGS_cdcsdk_enable_cleanup_of_non_eligible_tables_from_stream) = true; + ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_cdcsdk_disable_drop_table_cleanup) = true; + // Setup cluster. 
+ ASSERT_OK(SetUpWithParams(3, 3, false)); + const vector table_list_suffix = {"_1", "_2", "_3"}; + const int kNumTables = 3; + vector table(kNumTables); + int idx = 0; + vector> tablets(kNumTables); + + while (idx < 3) { + table[idx] = ASSERT_RESULT(CreateTable( + &test_cluster_, kNamespaceName, kTableName, 1, true, false, 0, true, + table_list_suffix[idx])); + ASSERT_OK(test_client()->GetTablets( + table[idx], 0, &tablets[idx], /* partition_list_version = */ nullptr)); + ASSERT_OK(WriteEnumsRows( + 0 /* start */, 100 /* end */, &test_cluster_, table_list_suffix[idx], kNamespaceName, + kTableName)); + idx += 1; + } + + auto stream_id = ASSERT_RESULT(CreateConsistentSnapshotStream()); + std::unordered_set expected_table_ids = { + table[0].table_id(), table[1].table_id(), table[2].table_id()}; + VerifyTablesInStreamMetadata(stream_id, expected_table_ids, "Waiting for stream metadata."); + + LOG(INFO) << "Dropping table: " << Format("$0$1", kTableName, table_list_suffix[0]); + DropTable(&test_cluster_, Format("$0$1", kTableName, table_list_suffix[0]).c_str()); + // Stream metadata wouldnt be cleaned up since the codepath is disabled via + // 'TEST_cdcsdk_disable_drop_table_cleanup' flag. Therefore all 3 tables are expected to be + // present in stream metadata. + SleepFor(MonoDelta::FromSeconds(3)); + VerifyTablesInStreamMetadata( + stream_id, expected_table_ids, "Waiting for stream metadata after drop table."); + + // On loading of CDC stream after a master leader restart, presence of non-eligible tables in CDC + // stream will be checked. + auto leader_master = ASSERT_RESULT(test_cluster_.mini_cluster_->GetLeaderMiniMaster()); + ASSERT_OK(leader_master->Restart()); + LOG(INFO) << "Master Restarted"; + SleepFor(MonoDelta::FromSeconds(5)); + + // Enable bg threads to cleanup CDC stream metadata for dropped tables. + ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_cdcsdk_disable_drop_table_cleanup) = false; + + // Verify the dropped table has been removed from stream metadata after enabling the cleanup. 
+ expected_table_ids.erase(table[0].table_id()); + VerifyTablesInStreamMetadata( + stream_id, expected_table_ids, + "Waiting for GetDBStreamInfo post metadata cleanup after restart."); +} + } // namespace cdc } // namespace yb diff --git a/src/yb/integration-tests/cdcsdk_ysql_test_base.h b/src/yb/integration-tests/cdcsdk_ysql_test_base.h index 7dd8f36c0ec2..9bebbf3e971a 100644 --- a/src/yb/integration-tests/cdcsdk_ysql_test_base.h +++ b/src/yb/integration-tests/cdcsdk_ysql_test_base.h @@ -122,6 +122,8 @@ DECLARE_bool(TEST_cdcsdk_skip_updating_cdc_state_entries_on_table_removal); DECLARE_bool(TEST_cdcsdk_add_indexes_to_stream); DECLARE_bool(cdcsdk_enable_cleanup_of_non_eligible_tables_from_stream); DECLARE_bool(TEST_cdcsdk_skip_stream_active_check); +DECLARE_bool(TEST_cdcsdk_disable_drop_table_cleanup); + namespace yb { using client::YBClient; diff --git a/src/yb/integration-tests/tablet-split-itest.cc b/src/yb/integration-tests/tablet-split-itest.cc index 9f9ebc80c778..939501456758 100644 --- a/src/yb/integration-tests/tablet-split-itest.cc +++ b/src/yb/integration-tests/tablet-split-itest.cc @@ -3617,9 +3617,8 @@ class TabletSplitSingleBlockITest : return STATUS(Incomplete, "Empty or too small SST."); } - auto data_block = VERIFY_RESULT(table_reader->RetrieveBlockFromFile( - rocksdb::ReadOptions::kDefault, index_iter->value(), rocksdb::BlockType::kData)); - return data_block->NumRestarts(); + return table_reader->TEST_GetBlockNumRestarts( + rocksdb::ReadOptions::kDefault, index_iter->value(), rocksdb::BlockType::kData); } }; diff --git a/src/yb/master/xrepl_catalog_manager.cc b/src/yb/master/xrepl_catalog_manager.cc index 981430b86e2d..838e346423cf 100644 --- a/src/yb/master/xrepl_catalog_manager.cc +++ b/src/yb/master/xrepl_catalog_manager.cc @@ -120,6 +120,9 @@ DEFINE_RUNTIME_bool(cdcsdk_enable_cleanup_of_non_eligible_tables_from_stream, fa "materialised view etc. in their stream metadata and these tables will be marked for removal " "by catalog manager background thread."); +DEFINE_test_flag(bool, cdcsdk_disable_drop_table_cleanup, false, + "When enabled, cleanup of dropped tables from CDC streams will be skipped."); + DEFINE_RUNTIME_AUTO_bool(cdcsdk_enable_identification_of_non_eligible_tables, kLocalPersisted, false, @@ -1809,27 +1812,32 @@ void CatalogManager::FindAllNonEligibleTablesInCDCSDKStream( for (const auto& table_id : table_ids) { if (!user_table_ids.contains(table_id)) { auto table_info = GetTableInfoUnlocked(table_id); - Schema schema; - Status status = table_info->GetSchema(&schema); - if (!status.ok()) { - LOG_WITH_FUNC(WARNING) << "Error while getting schema for table: " << table_info->name(); - // Skip this table for now, it will be revisited for removal on master restart/master leader - // change. - continue; - } + if (table_info) { + Schema schema; + Status status = table_info->GetSchema(&schema); + if (!status.ok()) { + LOG_WITH_FUNC(WARNING) << "Error while getting schema for table: " << table_info->name(); + // Skip this table for now, it will be revisited for removal on master restart/master + // leader change. + continue; + } - // Re-confirm this table is not meant to be part of a CDC stream. 
- if (!IsTableEligibleForCDCSDKStream(table_info, schema)) { - LOG(INFO) << "Found a non-eligible table: " << table_info->id() - << ", for stream: " << stream_id; - LockGuard lock(cdcsdk_non_eligible_table_mutex_); - namespace_to_cdcsdk_non_eligible_table_map_[table_info->namespace_id()].insert( - table_info->id()); + // Re-confirm this table is not meant to be part of a CDC stream. + if (!IsTableEligibleForCDCSDKStream(table_info, schema)) { + LOG(INFO) << "Found a non-eligible table: " << table_info->id() + << ", for stream: " << stream_id; + LockGuard lock(cdcsdk_non_eligible_table_mutex_); + namespace_to_cdcsdk_non_eligible_table_map_[table_info->namespace_id()].insert( + table_info->id()); + } else { + // Ideally we are not expected to enter the else clause. + LOG(WARNING) << "Found table " << table_id << " in metadata of stream " << stream_id + << " that is not present in the eligible list of tables " + "from the namespace for CDC"; + } } else { - // Ideally we are not expected to enter the else clause. - LOG(WARNING) << "Found table " << table_id << " in metadata of stream " << stream_id - << " that is not present in the eligible list of tables " - "from the namespace for CDC"; + LOG(INFO) << "Found table " << table_id << " in stream " << stream_id + << " metadata that is not present in master."; } } } @@ -4088,8 +4096,9 @@ void CatalogManager::RunXReplBgTasks(const LeaderEpoch& epoch) { // Clean up Failed Replication Bootstrap on the Consumer. WARN_NOT_OK(ClearFailedReplicationBootstrap(), "Failed Clearing Failed Replication Bootstrap"); - WARN_NOT_OK( - CleanUpCDCSDKStreamsMetadata(epoch), "Failed Cleanup CDCSDK Streams Metadata"); + if (!FLAGS_TEST_cdcsdk_disable_drop_table_cleanup) { + WARN_NOT_OK(CleanUpCDCSDKStreamsMetadata(epoch), "Failed Cleanup CDCSDK Streams Metadata"); + } // Restart xCluster and CDCSDK parent tablet deletion bg task. StartXReplParentTabletDeletionTaskIfStopped(); diff --git a/src/yb/rocksdb/CMakeLists.txt b/src/yb/rocksdb/CMakeLists.txt index 17fde91eb78f..f6364848ea9c 100644 --- a/src/yb/rocksdb/CMakeLists.txt +++ b/src/yb/rocksdb/CMakeLists.txt @@ -235,6 +235,7 @@ ADD_YB_TEST(db/merge_test) ADD_YB_TEST(db/options_file_test) ADD_YB_TEST(db/perf_context_test) ADD_YB_TEST(db/prefix_test) +ADD_YB_TEST(db/readahead_test) ADD_YB_TEST(db/skiplist_test) ADD_YB_TEST(db/table_properties_collector_test) ADD_YB_TEST(db/user_op_id_test) diff --git a/src/yb/rocksdb/db/readahead_test.cc b/src/yb/rocksdb/db/readahead_test.cc new file mode 100644 index 000000000000..8b2d43192a43 --- /dev/null +++ b/src/yb/rocksdb/db/readahead_test.cc @@ -0,0 +1,511 @@ +// Copyright (c) YugaByte, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. See the License for the specific language governing permissions and limitations +// under the License. 
+// + +#include "yb/rocksdb/db/db_test_util.h" + +#include "yb/rocksdb/env.h" + +#include "yb/rocksutil/yb_rocksdb_logger.h" + +#include "yb/util/compare_util.h" +#include "yb/util/size_literals.h" + +DECLARE_uint64(rocksdb_iterator_sequential_disk_reads_for_auto_readahead); +DECLARE_uint64(rocksdb_iterator_init_readahead_size); +DECLARE_uint64(rocksdb_iterator_max_readahead_size); +DECLARE_bool(TEST_rocksdb_record_readahead_stats_only_for_data_blocks); + +using namespace std::literals; + +namespace rocksdb { + +struct ReadaheadStats { + void AddReadaheadCall(size_t bytes_read) { + ++readahead_calls; + readahead_bytes_read += bytes_read; + } + + void IncreaseWindowAndAddReadaheadCall(size_t* readahead_size) { + *readahead_size = + std::min(*readahead_size * 2, FLAGS_rocksdb_iterator_max_readahead_size); + ++readahead_calls; + readahead_bytes_read += *readahead_size; + } + + void AddReadaheadReset(size_t* readahead_size) { + LOG(INFO) << "Expecting reset of readahead"; + *readahead_size = FLAGS_rocksdb_iterator_init_readahead_size; + ++readahead_reset; + } + + std::string ToString() const { + return YB_STRUCT_TO_STRING(readahead_calls, readahead_bytes_read, readahead_reset); + } + + uint64_t readahead_calls = 0; + uint64_t readahead_bytes_read = 0; + uint64_t readahead_reset = 0; +}; + +inline bool operator==(const ReadaheadStats& lhs, const ReadaheadStats& rhs) { + return YB_STRUCT_EQUALS(readahead_calls, readahead_bytes_read, readahead_reset); +} + +class TestRandomAccessFile; + +class TestEnv : public EnvWrapper { + public: + explicit TestEnv(Env* target) : EnvWrapper(target) {} + + Status NewRandomAccessFile( + const std::string& f, std::unique_ptr* r, + const EnvOptions& soptions) override; + + void OnRandomAccessFileDestroy(TestRandomAccessFile* file); + + TestRandomAccessFile* GetRandomAccessFile(const std::string& filename) { + std::lock_guard l(mutex_); + auto it = random_access_files_.find(filename); + if (it == random_access_files_.end()) { + LOG(ERROR) << "Filenames in TestEnv:"; + for (auto file : random_access_files_) { + LOG(ERROR) << file.first; + } + LOG(FATAL) << "No '" << filename << "' found in TestEnv"; + } + return it->second; + } + + private: + std::mutex mutex_; + std::unordered_map random_access_files_; +}; + +class TestRandomAccessFile : public yb::RandomAccessFileWrapper { + public: + TestRandomAccessFile(std::unique_ptr target, TestEnv* env) + : RandomAccessFileWrapper(std::move(target)), env_(env) {} + + ~TestRandomAccessFile() { + env_->OnRandomAccessFileDestroy(this); + } + + Status Read(uint64_t offset, size_t n, Slice* result, uint8_t* scratch) const override { + auto s = target()->Read(offset, n, result, scratch); + last_read_offset.store(offset); + last_read_length.store(result->size()); + read_count.fetch_add(1); + LOG(INFO) << "Disk read at: " << offset << " n: " << n << " filename: " << target()->filename(); + return s; + } + + void Readahead(size_t offset, size_t length) override { + target()->Readahead(offset, length); + last_readahead_offset.store(offset); + last_readahead_length.store(length); + } + + size_t GetReadaheadLimit() const { + return last_readahead_offset + last_readahead_length; + } + + size_t GetLastReadEnd() const { + return last_read_offset + last_read_length; + } + + TestEnv* env_; + mutable std::atomic read_count; + mutable std::atomic last_read_offset; + mutable std::atomic last_read_length; + mutable std::atomic last_readahead_offset; + mutable std::atomic last_readahead_length; +}; + +Status TestEnv::NewRandomAccessFile( + const 
std::string& f, std::unique_ptr* r, + const EnvOptions& soptions) { + RETURN_NOT_OK(target()->NewRandomAccessFile(f, r, soptions)); + auto* file = new TestRandomAccessFile(std::move(*r), this); + r->reset(file); + std::lock_guard l(mutex_); + random_access_files_[file->filename()] = file; + return Status::OK(); +} + +void TestEnv::OnRandomAccessFileDestroy(TestRandomAccessFile* file) { + std::lock_guard l(mutex_); + random_access_files_.erase(file->filename()); +} + +class ReadaheadTest : public DBTestBase { + public: + ReadaheadTest() : + DBTestBase("/readahead_test"), rnd_(301) { + FLAGS_TEST_rocksdb_record_readahead_stats_only_for_data_blocks = true; + FLAGS_rocksdb_iterator_init_readahead_size = kBlockSize * 4; + FLAGS_rocksdb_iterator_max_readahead_size = kBlockSize * 32; + + num_keys_ = static_cast(FLAGS_rocksdb_iterator_max_readahead_size * 10 / kValueSize); + + test_env_ = std::make_unique(env_); + + BlockBasedTableOptions table_options; + table_options.block_size = kBlockSize; + table_options.index_type = IndexType::kMultiLevelBinarySearch; + table_options.index_block_size = kBlockSize; + table_factory_.reset(new BlockBasedTableFactory(table_options)); + } + + ~ReadaheadTest() { + // Close DB so we can destroy of test_env_. + Close(); + } + + Options CurrentOptions() { + Options options = DBTestBase::CurrentOptions(); + options.env = test_env_.get(); + options.compaction_style = kCompactionStyleNone; + options.num_levels = 1; + options.statistics = rocksdb::CreateDBStatisticsForTests(); + options.info_log = std::make_shared(options.log_prefix); + // Large enough to prevent auto-flush. + options.write_buffer_size = 1024_MB; + + options.table_factory = table_factory_; + + return options; + } + + Status WriteData() { + for (int k = 0; k < num_keys_; ++k) { + RETURN_NOT_OK(Put(Key(k), RandomString(&rnd_, kValueSize))); + } + RETURN_NOT_OK(Flush()); + + auto live_files_meta = db_->GetLiveFilesMetaData(); + SCHECK_EQ(live_files_meta.size(), 1, InternalError, "Expected single SST file"); + sst_metadata_ = live_files_meta.front(); + + TablePropertiesCollection props; + RETURN_NOT_OK(db_->GetPropertiesOfAllTables(&props)); + SCHECK_EQ(props.size(), 1, InternalError, "Expected single SST file"); + sst_props_ = *props.begin()->second; + + avg_compressed_data_block_size_ = sst_props_.data_size / sst_props_.num_data_blocks; + LOG(INFO) << "avg_compressed_data_block_size: " << avg_compressed_data_block_size_; + avg_keys_per_block_ = num_keys_ / sst_props_.num_data_blocks; + + return Status::OK(); + } + + void PurgeBlockCache() { + auto* block_cache = table_factory_->table_options().block_cache.get(); + auto capacity = block_cache->GetCapacity(); + block_cache->SetCapacity(0); + block_cache->SetCapacity(capacity); + LOG(INFO) << "Purged block cache"; + } + + ReadaheadStats GetReadaheadStats() { + auto* stats = last_options_.statistics.get(); + return ReadaheadStats { + .readahead_calls = stats->getTickerCount(Tickers::READAHEAD_CALLS), + .readahead_bytes_read = stats->getTickerCount(Tickers::READAHEAD_BYTES_READ), + .readahead_reset = stats->getTickerCount(Tickers::READAHEAD_RESET), + }; + } + + Status ExpectReadaheadStats(const ReadaheadStats& expected) { + SCHECK_EQ(GetReadaheadStats(), expected, InternalError, "Unexpected stats"); + LOG(INFO) << "Readahead stats: " << expected.ToString(); + return Status::OK(); + } + + Status SeekToKey(const std::unique_ptr& iter, int key_idx) { + LOG(INFO) << "Seeking to key_idx: " << key_idx; + iter->Seek(Key(key_idx)); + 
SCHECK(VERIFY_RESULT(iter->CheckedValid()), InternalError, "Iterator is not valid"); + return Status::OK(); + } + + // Returns true if reached end of readahead window and false if reached end of data. + Result ReadOneKeyPerBlockUntilOutOfReadaheadWindow( + const std::unique_ptr& iter, TestRandomAccessFile* data_file, + int* current_key_idx) { + auto stats = GetReadaheadStats(); + auto readahead_limit = data_file->GetReadaheadLimit(); + for(;;) { + if (data_file->GetLastReadEnd() > readahead_limit) { + break; + } + RETURN_NOT_OK(ExpectReadaheadStats(stats)); + *current_key_idx += avg_keys_per_block_; + if (*current_key_idx >= num_keys_) { + return false; + } + RETURN_NOT_OK(SeekToKey(iter, *current_key_idx)); + } + return true; + } + + static constexpr auto kBlockSize = 8_KB; + static constexpr auto kValueSize = kBlockSize / 16; + + int num_keys_; + + Random rnd_; + std::unique_ptr test_env_; + std::shared_ptr table_factory_; + + std::optional sst_metadata_; + TableProperties sst_props_; + size_t avg_compressed_data_block_size_; + int avg_keys_per_block_; +}; + +namespace { + +void AddWholeFileReadaheads(size_t file_size, size_t* readahead_calls, size_t* readahead_bytes) { + size_t readahead_size = FLAGS_rocksdb_iterator_init_readahead_size; + for (;;) { + ++*readahead_calls; + *readahead_bytes += readahead_size; + if (file_size <= readahead_size) { + break; + } + file_size -= readahead_size; + readahead_size = + std::min(readahead_size * 2, FLAGS_rocksdb_iterator_max_readahead_size); + } +} + +} // namespace + +TEST_F(ReadaheadTest, SequentialScan) { + Options options = CurrentOptions(); + Reopen(options); + + ASSERT_OK(WriteData()); + + for (auto seq_disk_reads_for_readahead : {0, 1, 2, 3, 4, 5, 8, 16}) { + LOG(INFO) << "Setting FLAGS_rocksdb_iterator_sequential_disk_reads_for_auto_readahead = " + << seq_disk_reads_for_readahead; + FLAGS_rocksdb_iterator_sequential_disk_reads_for_auto_readahead = seq_disk_reads_for_readahead; + for (bool purge_block_cache : {true, false}) { + if (purge_block_cache) { + PurgeBlockCache(); + } + + auto* stats = options.statistics.get(); + stats->resetTickersForTest(); + + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + size_t num_keys_read = 0; + for (iter->SeekToFirst(); ASSERT_RESULT(iter->CheckedValid()); + iter->Next(), ++num_keys_read) { + if ((seq_disk_reads_for_readahead > 1) && + (num_keys_read == + (seq_disk_reads_for_readahead - 1) * num_keys_ / sst_props_.num_data_blocks)) { + LOG(INFO) << "num_keys: " << num_keys_ + << " num_data_blocks: " << sst_props_.num_data_blocks + << " num_keys_read: " << num_keys_read; + // We are about to reach seq_disk_reads_for_readahead disk reads. Should be no readaheads + // till now. 
+ ASSERT_OK(ExpectReadaheadStats(ReadaheadStats())); + } + } + + size_t expected_num_readaheads = 0; + size_t expected_readahead_bytes_read = 0; + if (seq_disk_reads_for_readahead > 0 && purge_block_cache) { + const auto bytes_should_read_before_readahead = + (seq_disk_reads_for_readahead - 1) * avg_compressed_data_block_size_; + const auto data_size = sst_props_.data_size; + if (data_size > bytes_should_read_before_readahead) { + AddWholeFileReadaheads( + data_size - bytes_should_read_before_readahead, &expected_num_readaheads, + &expected_readahead_bytes_read); + } + LOG(INFO) << " data_size: " << data_size + << " bytes_should_read_before_readahead: " << bytes_should_read_before_readahead; + } + + const auto num_readaheads = stats->getTickerCount(Tickers::READAHEAD_CALLS); + const auto readahead_bytes_read = stats->getTickerCount(Tickers::READAHEAD_BYTES_READ); + + ASSERT_GE(num_readaheads, expected_num_readaheads); + ASSERT_GE(readahead_bytes_read, expected_readahead_bytes_read); + + // We can readahead more in reality due to blocks located on readahead window boundary. + ASSERT_LE(num_readaheads, expected_num_readaheads * 1.1); + ASSERT_LE( + readahead_bytes_read, + expected_readahead_bytes_read + (num_readaheads - expected_num_readaheads) * + FLAGS_rocksdb_iterator_max_readahead_size); + + ASSERT_EQ(stats->getTickerCount(Tickers::READAHEAD_RESET), 0); + } + } +} + +TEST_F(ReadaheadTest, MixedReadsWith1SeqDiskReadsForReadahead) { + FLAGS_rocksdb_iterator_sequential_disk_reads_for_auto_readahead = 1; + + Options options = CurrentOptions(); + Reopen(options); + + ASSERT_OK(WriteData()); + + TestRandomAccessFile* data_file = test_env_->GetRandomAccessFile(sst_metadata_->DataFilePath()); + + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + + size_t expected_readahead_size = FLAGS_rocksdb_iterator_init_readahead_size; + ReadaheadStats expected_stats; + + int current_key_idx = 0; + ASSERT_OK(SeekToKey(iter, current_key_idx)); + + expected_stats.AddReadaheadCall(expected_readahead_size); + ASSERT_OK(ExpectReadaheadStats(expected_stats)); + + ASSERT_OK(ReadOneKeyPerBlockUntilOutOfReadaheadWindow(iter, data_file, ¤t_key_idx)); + + expected_stats.IncreaseWindowAndAddReadaheadCall(&expected_readahead_size); + ASSERT_OK(ExpectReadaheadStats(expected_stats)); + + ASSERT_OK(ReadOneKeyPerBlockUntilOutOfReadaheadWindow(iter, data_file, ¤t_key_idx)); + + expected_stats.IncreaseWindowAndAddReadaheadCall(&expected_readahead_size); + ASSERT_OK(ExpectReadaheadStats(expected_stats)); + + constexpr auto kBlocksToJump = 4; + + // Jump forward. + current_key_idx += kBlocksToJump * avg_keys_per_block_; + ASSERT_OK(SeekToKey(iter, current_key_idx)); + + expected_stats.AddReadaheadReset(&expected_readahead_size); + expected_stats.AddReadaheadCall(expected_readahead_size); + ASSERT_OK(ExpectReadaheadStats(expected_stats)); + + ASSERT_OK(ReadOneKeyPerBlockUntilOutOfReadaheadWindow(iter, data_file, ¤t_key_idx)); + expected_stats.IncreaseWindowAndAddReadaheadCall(&expected_readahead_size); + ASSERT_OK(ExpectReadaheadStats(expected_stats)); + + // Jump backward. + current_key_idx -= kBlocksToJump * avg_keys_per_block_; + ASSERT_OK(SeekToKey(iter, current_key_idx)); + + // No disk reads, served from block cache but still should reset readahead. + expected_stats.AddReadaheadReset(&expected_readahead_size); + ASSERT_OK(ExpectReadaheadStats(expected_stats)); + + // Read next blocks, served from block cache, no disk reads => no readahead. 
+ for (int i = 0; i < kBlocksToJump; ++i) { + current_key_idx += avg_keys_per_block_; + ASSERT_OK(SeekToKey(iter, current_key_idx)); + ASSERT_OK(ExpectReadaheadStats(expected_stats)); + } + + PurgeBlockCache(); + + // Read next block after purging block cache, should do readahead. + current_key_idx += avg_keys_per_block_; + ASSERT_OK(SeekToKey(iter, current_key_idx)); + expected_stats.AddReadaheadCall(expected_readahead_size); + ASSERT_OK(ExpectReadaheadStats(expected_stats)); + + ASSERT_OK(ReadOneKeyPerBlockUntilOutOfReadaheadWindow(iter, data_file, ¤t_key_idx)); + expected_stats.IncreaseWindowAndAddReadaheadCall(&expected_readahead_size); + ASSERT_OK(ExpectReadaheadStats(expected_stats)); +} + +TEST_F(ReadaheadTest, MixedReads) { + constexpr auto kNumRandomSeeks = 200; + + Options options = CurrentOptions(); + Reopen(options); + + ASSERT_OK(WriteData()); + + TestRandomAccessFile* data_file = test_env_->GetRandomAccessFile(sst_metadata_->DataFilePath()); + + for (auto seq_disk_reads_for_readahead : {2, 3, 4, 5, 8, 16}) { + LOG(INFO) << "Setting FLAGS_rocksdb_iterator_sequential_disk_reads_for_auto_readahead = " + << seq_disk_reads_for_readahead; + FLAGS_rocksdb_iterator_sequential_disk_reads_for_auto_readahead = seq_disk_reads_for_readahead; + PurgeBlockCache(); + + auto* stats = options.statistics.get(); + stats->resetTickersForTest(); + + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + + size_t expected_readahead_size = FLAGS_rocksdb_iterator_init_readahead_size; + ReadaheadStats expected_stats; + + int current_key_idx = -1; + auto prev_num_disk_reads = data_file->read_count.load(); + for (int random_seek_iter = 0; random_seek_iter < kNumRandomSeeks; ++random_seek_iter) { + // Seek to another random block (and not the next one). + const auto prev_key_idx = current_key_idx; + do { + current_key_idx = rnd_.Uniform(num_keys_); + } while (prev_key_idx >= 0 && current_key_idx >= prev_key_idx - avg_keys_per_block_ && + current_key_idx <= prev_key_idx + 2 * avg_keys_per_block_); + + auto num_disk_reads = data_file->read_count.load(); + if (num_disk_reads > prev_num_disk_reads) { + expected_stats.AddReadaheadReset(&expected_readahead_size); + prev_num_disk_reads = num_disk_reads; + } + LOG(INFO) << "Disk read count: " << num_disk_reads + << ". Moving to random key: " << current_key_idx; + + for (; current_key_idx < num_keys_; + current_key_idx += avg_keys_per_block_) { + ASSERT_OK(SeekToKey(iter, current_key_idx)); + LOG(INFO) << "Disk read count: " << data_file->read_count; + + // No readahead until Nth seq disk reads. 
+ if (data_file->read_count == num_disk_reads + seq_disk_reads_for_readahead) { + expected_stats.AddReadaheadCall(expected_readahead_size); + ASSERT_OK(ExpectReadaheadStats(expected_stats)); + break; + } + ASSERT_OK(ExpectReadaheadStats(expected_stats)); + } + + for (int readahead_window_iter = 0; readahead_window_iter < 2; ++readahead_window_iter) { + num_disk_reads = data_file->read_count.load(); + if (!ASSERT_RESULT( + ReadOneKeyPerBlockUntilOutOfReadaheadWindow(iter, data_file, ¤t_key_idx))) { + continue; + } + if (data_file->read_count > num_disk_reads) { + expected_stats.IncreaseWindowAndAddReadaheadCall(&expected_readahead_size); + } + ASSERT_OK(ExpectReadaheadStats(expected_stats)); + } + } + } +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + google::ParseCommandLineFlags(&argc, &argv, /* remove_flags = */ true); + return RUN_ALL_TESTS(); +} diff --git a/src/yb/rocksdb/statistics.h b/src/yb/rocksdb/statistics.h index c3627b67eb30..e6125814a13a 100644 --- a/src/yb/rocksdb/statistics.h +++ b/src/yb/rocksdb/statistics.h @@ -218,6 +218,10 @@ enum Tickers : uint32_t { COMPACTION_FILES_FILTERED, COMPACTION_FILES_NOT_FILTERED, + READAHEAD_RESET, + READAHEAD_CALLS, + READAHEAD_BYTES_READ, + // End of ticker enum. TICKER_ENUM_MAX, }; diff --git a/src/yb/rocksdb/table/block_based_table_reader.cc b/src/yb/rocksdb/table/block_based_table_reader.cc index 562c328cab9e..e982508f02f8 100644 --- a/src/yb/rocksdb/table/block_based_table_reader.cc +++ b/src/yb/rocksdb/table/block_based_table_reader.cc @@ -65,9 +65,32 @@ #include "yb/util/logging.h" #include "yb/util/mem_tracker.h" #include "yb/util/scope_exit.h" +#include "yb/util/size_literals.h" #include "yb/util/stats/perf_step_timer.h" #include "yb/util/status_format.h" #include "yb/util/string_util.h" +#include "yb/util/tostring.h" + +using yb::operator"" _KB; + +DEFINE_RUNTIME_uint64(rocksdb_iterator_sequential_disk_reads_for_auto_readahead, 4, + "Enable readahead when RocksDB iterator attempts to perform a configured number of sequential " + "disk reads. If set to 0, the iterator readahead is disabled. If set to 1, iterator readahead " + "will be enabled with the first disk read. If set to N > 1, iterator readahead will be used " + "with the Nth sequential disk read."); + +TAG_FLAG(rocksdb_iterator_sequential_disk_reads_for_auto_readahead, advanced); + +DEFINE_RUNTIME_uint64(rocksdb_iterator_init_readahead_size, 32_KB, + "Initial RocksDB iterator readahead size."); +TAG_FLAG(rocksdb_iterator_init_readahead_size, advanced); + +DEFINE_RUNTIME_uint64(rocksdb_iterator_max_readahead_size, 2_MB, + "Maximum RocksDB iterator readahead size."); +TAG_FLAG(rocksdb_iterator_max_readahead_size, advanced); + +DEFINE_test_flag(bool, rocksdb_record_readahead_stats_only_for_data_blocks, false, + "For testing only. Record readahead statistics only for data blocks."); namespace rocksdb { @@ -130,6 +153,19 @@ class NotMatchingFilterBlockReader : public FilterBlockReader { virtual size_t ApproximateMemoryUsage() const override { return 0; } }; +InternalIterator* ReturnErrorIterator(const Status& status, BlockIter* input_iter) { + if (input_iter != nullptr) { + input_iter->SetStatus(status); + return input_iter; + } else { + return NewErrorInternalIterator(status); + } +} + +Status ReturnNoIOError() { + return STATUS(Incomplete, "no blocking io"); +} + } // namespace // Originally following data was stored in BlockBasedTable::Rep and related to a single SST file. 
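To make the flag interplay above concrete, a small sketch (not part of the diff) of the growth rule the iterator readahead follows with the default values: readahead is first issued on the 4th sequential disk read, starts at 32 KB, and doubles per readahead call until it reaches the 2 MB cap.

    // Hedged illustration; mirrors readahead_size_ = std::min(readahead_size_ * 2, max_readahead_size_)
    // in the iterator state below.
    size_t readahead_size = 32 * 1024;                   // FLAGS_rocksdb_iterator_init_readahead_size
    const size_t max_readahead_size = 2 * 1024 * 1024;   // FLAGS_rocksdb_iterator_max_readahead_size
    for (int call = 1; call <= 8; ++call) {
      // Prints 32K, 64K, 128K, 256K, 512K, 1M, 2M, 2M.
      LOG(INFO) << "readahead call " << call << ": " << readahead_size << " bytes";
      readahead_size = std::min(readahead_size * 2, max_readahead_size);
    }

The readahead_test.cc added earlier exercises the same rule with smaller limits (kBlockSize * 4 = 32 KB initial and kBlockSize * 32 = 256 KB maximum, with kBlockSize = 8 KB).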
@@ -218,9 +254,8 @@ struct BlockBasedTable::Rep { Footer footer; std::mutex data_index_reader_mutex; yb::AtomicUniquePtr data_index_reader; - unique_ptr data_index_iterator_state; - unique_ptr filter_index_reader; - unique_ptr filter; + std::unique_ptr filter_index_reader; + std::unique_ptr filter; FilterType filter_type; @@ -245,9 +280,19 @@ struct BlockBasedTable::Rep { yb::MemTrackerPtr mem_tracker; }; -// BlockEntryIteratorState doesn't actually store any iterator state and is only used as an adapter -// to BlockBasedTable. It is used by TwoLevelIterator and MultiLevelIterator to call BlockBasedTable -// functions in order to check if prefix may match or to create a secondary iterator. +struct BlockBasedTable::BlockRetrievalInfo { + BlockType type; + BlockHandle handle; + RandomAccessFileReader* file_reader; + Slice cache_key; + Slice compressed_cache_key; + + char cache_key_buf[block_based_table::kCacheKeyBufferSize]; + char compressed_cache_key_buf[block_based_table::kCacheKeyBufferSize]; +}; + +// BlockEntryIteratorState is used by TwoLevelIterator and MultiLevelIterator in order to check if +// key prefix may match the filter of the SST file or to create a secondary iterator. class BlockBasedTable::BlockEntryIteratorState : public TwoLevelIteratorState { public: BlockEntryIteratorState( @@ -257,10 +302,95 @@ class BlockBasedTable::BlockEntryIteratorState : public TwoLevelIteratorState { table_(table), read_options_(read_options), skip_filters_(skip_filters), - block_type_(block_type) {} + block_type_(block_type), + statistics_( + (PREDICT_FALSE(FLAGS_TEST_rocksdb_record_readahead_stats_only_for_data_blocks) && + block_type_ != BlockType::kData) + ? nullptr + : read_options.statistics ? read_options.statistics + : table_->rep_->ioptions.statistics) {} InternalIterator* NewSecondaryIterator(const Slice& index_value) override { - return table_->NewDataBlockIterator(read_options_, index_value, block_type_); + PERF_TIMER_GUARD(new_table_block_iter_nanos); + + BlockRetrievalInfo block_info; + { + auto status = table_->GetBlockRetrievalInfo(index_value, block_type_, &block_info); + if (!status.ok()) { + return NewErrorInternalIterator(status); + } + } + auto block_res = table_->GetBlockFromCache(read_options_, block_info); + const auto& handle = block_info.handle; + VLOG_WITH_FUNC(5) << "handle: " << handle.ToDebugString() + << " num_sequential_disk_reads_: " << num_sequential_disk_reads_ + << " prev_offset_: " << prev_offset_ << " prev_length_: " << prev_length_ + << " block from cache: " + << (block_res.ok() ? yb::AsString(static_cast(block_res->value)) + : AsString(block_res.status())) + << " for file: " << block_info.file_reader->file()->filename(); + + const auto is_sequential_read = IsSequentialRead(handle); + if (!is_sequential_read && + (num_sequential_disk_reads_ > 0 || readahead_limit_ > 0)) { + VLOG_WITH_FUNC(4) << "handle: " << handle.ToDebugString() << " prev_offset_: " << prev_offset_ + << " prev_length_: " << prev_length_ << " skip_size: " + << static_cast(handle.offset() - prev_offset_ - prev_length_) + << " readahead_size_: " << readahead_size_ + << ". Resetting readahead for iterator for file: " + << block_info.file_reader->file()->filename(); + ResetReadahead(); + } + + if (block_res.ok() && !block_res->value) { + // Automatically prefetch additional data when a range scan (iterator) does + // sequential_disk_reads_for_auto_readahead_ sequential IOs. + // TODO(readahead): look into implementing readahead for backward scan. 
+ if (sequential_disk_reads_for_auto_readahead_ > 0) { + const auto is_block_after_readahead_limit = + handle.offset() + handle.size() + kBlockTrailerSize > readahead_limit_; + + // 1) If we are reading within readahead window it doesn't cause disk read, so no need to + // count it. + // 2) Count first disk read in a row. + if (is_block_after_readahead_limit && + (is_sequential_read || num_sequential_disk_reads_ == 0)) { + ++num_sequential_disk_reads_; + } + + // If sequential_disk_reads_for_auto_readahead_ = N, we want to start readahead with the Nth + // sequential disk read in a row. + // For example, for N = 2, initial readahead size = 4096, max readahead size = 8192: + // 1st disk read: offset = 0, length = 1024, num_sequential_disk_reads_ = 1 - just read 1024 + // bytes. + // 2nd disk read: offset = 1024, length = 1024, num_sequential_disk_reads_ = 2 - readahead + // 4096 bytes until offset 1024 + 4096 = 5120. + // ... reads from readahead window ... + // 3rd disk read: offset = 5120, length = 1024, num_sequential_disk_reads_ = 3 - readahead + // 4096*2 = 8192 bytes until offset 5120 + 8192 = 13312. + if ((num_sequential_disk_reads_ >= sequential_disk_reads_for_auto_readahead_) && + is_block_after_readahead_limit) { + VLOG_WITH_FUNC(4) << "handle: " << handle.ToDebugString() + << " calling readahead with size: " << readahead_size_ + << " for file: " << block_info.file_reader->file()->filename(); + block_info.file_reader->Readahead(handle.offset(), readahead_size_); + RecordTick(statistics_, READAHEAD_CALLS); + RecordTick(statistics_, READAHEAD_BYTES_READ, readahead_size_); + readahead_limit_ = handle.offset() + readahead_size_; + // Keep exponentially increasing readahead size up to max_readahead_size_. + readahead_size_ = std::min(readahead_size_ * 2, max_readahead_size_); + } + } + + block_res = table_->ReadBlockFromFileAndMaybePutToCache(read_options_, block_info); + } + UpdateReadPattern(handle); + + if (!block_res.ok()) { + return NewErrorInternalIterator(block_res.status()); + } + + return table_->NewBlockIterator(block_res.get_ptr(), block_type_, /* input_iter = */ nullptr); } bool PrefixMayMatch(const Slice& internal_key) override { @@ -271,6 +401,22 @@ class BlockBasedTable::BlockEntryIteratorState : public TwoLevelIteratorState { } private: + void ResetReadahead() { + num_sequential_disk_reads_ = 0; + readahead_size_ = initial_readahead_size_; + readahead_limit_ = 0; + RecordTick(statistics_, READAHEAD_RESET); + } + + void UpdateReadPattern(BlockHandle handle) { + prev_offset_ = handle.offset(); + prev_length_ = handle.size() + kBlockTrailerSize; + } + + bool IsSequentialRead(BlockHandle handle) { + return handle.offset() == prev_offset_ + prev_length_; + } + // Don't own table_. BlockEntryIteratorState should only be stored in iterators or in // corresponding BlockBasedTable. TableReader (superclass of BlockBasedTable) is only destroyed // after iterator is deleted. 
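A short worked trace of the read-pattern tracking above (an editorial sketch, not part of the diff), assuming the usual 5-byte RocksDB block trailer so that prev_length_ = handle.size() + kBlockTrailerSize:

    // Block A: offset 0, size 4096   -> prev_offset_ = 0, prev_length_ = 4101.
    // Block B: offset 4101           -> IsSequentialRead() is true; the sequential disk read
    //                                   counter advances and readahead may be issued or extended.
    // Block C: offset 20000          -> gap detected; ResetReadahead() clears the counter, shrinks
    //                                   the window back to the initial size and bumps READAHEAD_RESET.

The reset also fires for backward jumps that are served entirely from the block cache, which is what the MixedReadsWith1SeqDiskReadsForReadahead test added earlier verifies.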
@@ -278,6 +424,19 @@ class BlockBasedTable::BlockEntryIteratorState : public TwoLevelIteratorState { const ReadOptions read_options_; const bool skip_filters_; const BlockType block_type_; + Statistics* const statistics_; + + const uint64_t sequential_disk_reads_for_auto_readahead_ = + yb::GetAtomicFlag(&FLAGS_rocksdb_iterator_sequential_disk_reads_for_auto_readahead); + const size_t max_readahead_size_ = yb::GetAtomicFlag(&FLAGS_rocksdb_iterator_max_readahead_size); + const size_t initial_readahead_size_ = std::min( + yb::GetAtomicFlag(&FLAGS_rocksdb_iterator_init_readahead_size), max_readahead_size_); + + size_t readahead_size_ = initial_readahead_size_; + size_t prev_offset_ = std::numeric_limits::max(); + size_t prev_length_ = 0; + uint64_t num_sequential_disk_reads_ = 0; + size_t readahead_limit_ = 0; }; @@ -424,12 +583,6 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, SetupCacheKeyPrefix(rep, rep->base_reader_with_cache_prefix.get()); unique_ptr new_table(new BlockBasedTable(rep)); - // rep->data_index_iterator_state must be instantiated before the first call of - // `BlockBasedTable::CreateDataBlockIndexReader` which might happpen for PRELOAD_ON_OPEN. - const bool skip_filters_for_index = true; - rep->data_index_iterator_state = std::make_unique( - new_table.get(), ReadOptions::kDefault, skip_filters_for_index, BlockType::kIndex); - // Read meta index std::unique_ptr meta; std::unique_ptr meta_iter; @@ -766,7 +919,6 @@ Status BlockBasedTable::GetDataBlockFromCache( const ReadOptions& read_options, BlockBasedTable::CachableEntry* block, uint32_t format_version, BlockType block_type, const std::shared_ptr& mem_tracker) { - Status s; Block* compressed_block = nullptr; Cache::Handle* block_cache_compressed_handle = nullptr; @@ -779,7 +931,7 @@ Status BlockBasedTable::GetDataBlockFromCache( if (block->cache_handle != nullptr) { block->value = static_cast(block_cache->Value(block->cache_handle)); - return s; + return Status::OK(); } } @@ -787,7 +939,7 @@ Status BlockBasedTable::GetDataBlockFromCache( assert(block->cache_handle == nullptr && block->value == nullptr); if (block_cache_compressed == nullptr) { - return s; + return Status::OK(); } assert(!compressed_block_cache_key.empty()); @@ -797,7 +949,7 @@ Status BlockBasedTable::GetDataBlockFromCache( // uncompressed cache if (block_cache_compressed_handle == nullptr) { RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS); - return s; + return Status::OK(); } // found compressed block @@ -808,7 +960,7 @@ Status BlockBasedTable::GetDataBlockFromCache( // Retrieve the uncompressed contents into a new buffer BlockContents contents; - s = UncompressBlockContents(compressed_block->data(), compressed_block->size(), &contents, + auto s = UncompressBlockContents(compressed_block->data(), compressed_block->size(), &contents, format_version, mem_tracker); // Insert uncompressed block into block cache @@ -1084,23 +1236,6 @@ BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( return { filter, cache_handle }; } -namespace { - -InternalIterator* ReturnErrorIterator(const Status& status, BlockIter* input_iter) { - if (input_iter != nullptr) { - input_iter->SetStatus(status); - return input_iter; - } else { - return NewErrorInternalIterator(status); - } -} - -Status ReturnNoIOError() { - return STATUS(Incomplete, "no blocking io"); -} - -} // namespace - yb::Result> BlockBasedTable::GetIndexReader( const ReadOptions& read_options) { auto* index_reader = rep_->data_index_reader.get(std::memory_order_acquire); @@ -1173,8 +1308,13 
@@ InternalIterator* BlockBasedTable::NewIndexIterator( return ReturnErrorIterator(index_reader_result.status(), input_iter); } + const bool skip_filters_for_index = true; + auto* new_iter = index_reader_result->value->NewIterator( - input_iter, rep_->data_index_iterator_state.get(), read_options.total_order_seek); + input_iter, + std::make_unique( + this, ReadOptions::kDefault, skip_filters_for_index, BlockType::kIndex), + read_options.total_order_seek); if (index_reader_result->cache_handle) { auto iter = new_iter ? new_iter : input_iter; @@ -1190,114 +1330,125 @@ InternalIterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_opti return NewIndexIterator(read_options, /* input_iter = */ nullptr); } -yb::Result> BlockBasedTable::RetrieveBlock( - const ReadOptions& ro, const Slice& index_value, - const BlockType block_type, const bool use_cache) { - const bool no_io = (ro.read_tier == kBlockCacheTier); - Cache* block_cache = rep_->table_options.block_cache.get(); - Cache* block_cache_compressed = rep_->table_options.block_cache_compressed.get(); - CachableEntry block; - - BlockHandle handle; - Slice input = index_value; - - // We intentionally allow extra stuff in index_value so that we - // can add more features in the future. - RETURN_NOT_OK(handle.DecodeFrom(&input)); +// Using status return value because returning Result leads to +// 1.26x higher latency for scanG4_pkey_INclause_rangescan_increasing_rows_1000row workload on RF=1 +// local cluster with release build. +// The size of the Result's underlying type matters and may provide a noticeable degradation in +// hot paths due to a compiler is not always able to use RVO/elide constructors and the Result +// with the corresponding type could be copied. +Status BlockBasedTable::GetBlockRetrievalInfo( + Slice index_value, const BlockType block_type, BlockBasedTable::BlockRetrievalInfo* info) { + info->type = block_type; - FileReaderWithCachePrefix* reader = GetBlockReader(block_type); - - // If either block cache is enabled, we'll try to read from it. - if (PREDICT_TRUE(use_cache) && (block_cache != nullptr || block_cache_compressed != nullptr)) { - Statistics* statistics = ro.statistics ? ro.statistics : rep_->ioptions.statistics; - char cache_key[block_based_table::kCacheKeyBufferSize]; - char compressed_cache_key[block_based_table::kCacheKeyBufferSize]; - Slice key, /* key to the block cache */ - ckey /* key to the compressed block cache */; + // We intentionally allow extra stuff in index_value so that we can add more features in the + // future. + RETURN_NOT_OK(info->handle.DecodeFrom(&index_value)); - // create key for block cache - if (block_cache != nullptr) { - key = GetCacheKey(reader->cache_key_prefix, handle, cache_key); - } + auto* reader = GetBlockReader(block_type); + info->file_reader = reader->reader.get(); - if (block_cache_compressed != nullptr) { - ckey = GetCacheKey(reader->compressed_cache_key_prefix, handle, compressed_cache_key); - } + // Create key for block cache. 
+ if (rep_->table_options.block_cache) { + info->cache_key = GetCacheKey(reader->cache_key_prefix, info->handle, info->cache_key_buf); + } - Status status = GetDataBlockFromCache( - key, ckey, block_cache, block_cache_compressed, statistics, ro, &block, - rep_->table_options.format_version, block_type, rep_->mem_tracker); + if (rep_->table_options.block_cache_compressed) { + info->compressed_cache_key = GetCacheKey( + reader->compressed_cache_key_prefix, info->handle, info->compressed_cache_key_buf); + } - if (block.value == nullptr && !no_io && ro.fill_cache) { - std::unique_ptr raw_block; - { - StopWatch sw(rep_->ioptions.env, statistics, READ_BLOCK_GET_MICROS); - RETURN_NOT_OK(block_based_table::ReadBlockFromFile( - reader->reader.get(), rep_->footer, ro, handle, &raw_block, rep_->ioptions.env, - rep_->mem_tracker, block_cache_compressed == nullptr)); - } + return Status::OK(); +} - RETURN_NOT_OK(PutDataBlockToCache(key, ckey, block_cache, block_cache_compressed, - ro, statistics, &block, raw_block.release(), - rep_->table_options.format_version, rep_->mem_tracker)); - status = Status::OK(); - } +yb::Result> BlockBasedTable::GetBlockFromCache( + const ReadOptions& ro, const BlockRetrievalInfo& block_info) { + Cache* block_cache = rep_->table_options.block_cache.get(); + Cache* block_cache_compressed = rep_->table_options.block_cache_compressed.get(); + CachableEntry block; - RETURN_NOT_OK(status); + // If either block cache is enabled, we'll try to read from it. + if ((block_cache != nullptr || block_cache_compressed != nullptr)) { + Statistics* statistics = ro.statistics ? ro.statistics : rep_->ioptions.statistics; + RETURN_NOT_OK(GetDataBlockFromCache( + block_info.cache_key, block_info.compressed_cache_key, block_cache, block_cache_compressed, + statistics, ro, &block, rep_->table_options.format_version, block_info.type, + rep_->mem_tracker)); } - // Got data from block caches. - if (block.value) { - return block; - } + return block; +} - // Could not read from block_cache and can't do IO. - if (no_io) { +yb::Result> +BlockBasedTable::ReadBlockFromFileAndMaybePutToCache( + const ReadOptions& ro, const BlockBasedTable::BlockRetrievalInfo& block_info) { + if (ro.read_tier == kBlockCacheTier) { return ReturnNoIOError(); } - std::unique_ptr block_value; - RETURN_NOT_OK(block_based_table::ReadBlockFromFile( - reader->reader.get(), rep_->footer, ro, handle, &block_value, rep_->ioptions.env, - rep_->mem_tracker)); + auto* block_cache = rep_->table_options.block_cache.get(); + auto* block_cache_compressed = rep_->table_options.block_cache_compressed.get(); - block.value = block_value.release(); - RSTATUS_DCHECK(block.value, Incomplete, "No data block"); // Not expected to happen. + Statistics* statistics = ro.statistics ? ro.statistics : rep_->ioptions.statistics; + std::unique_ptr raw_block; + { + // Don't uncompress for now if we need to fill compressed block cache. + // It will be uncompressed by PutDataBlockToCache. 
+ const auto skip_uncompress = ro.fill_cache && block_cache_compressed != nullptr; + StopWatch sw(rep_->ioptions.env, statistics, READ_BLOCK_GET_MICROS); + RETURN_NOT_OK(block_based_table::ReadBlockFromFile( + block_info.file_reader, rep_->footer, ro, block_info.handle, &raw_block, rep_->ioptions.env, + rep_->mem_tracker, /* do_uncompress = */ !skip_uncompress)); + } + + CachableEntry block; + if (ro.fill_cache && (block_cache != nullptr || block_cache_compressed != nullptr)) { + RETURN_NOT_OK(PutDataBlockToCache( + block_info.cache_key, block_info.compressed_cache_key, block_cache, block_cache_compressed, + ro, statistics, &block, raw_block.release(), rep_->table_options.format_version, + rep_->mem_tracker)); + } else { + block.value = raw_block.release(); + } + + RSTATUS_DCHECK(block.value, Incomplete, "No block"); // Not expected to happen. return block; } -yb::Result> BlockBasedTable::RetrieveBlockFromFile(const ReadOptions& ro, - const Slice& index_value, const BlockType block_type) { - auto block = VERIFY_RESULT(RetrieveBlock(ro, index_value, block_type, /* use_cache = */ false)); - CHECK(block.cache_handle == nullptr); // We requested no cache at previous command. - return std::unique_ptr(block.value); +yb::Result> BlockBasedTable::RetrieveBlock( + const ReadOptions& ro, const Slice index_value, const BlockType block_type) { + BlockRetrievalInfo block_info; + RETURN_NOT_OK(GetBlockRetrievalInfo(index_value, block_type, &block_info)); + auto block_res = GetBlockFromCache(ro, block_info); + if (block_res.ok() && !block_res->value) { + return ReadBlockFromFileAndMaybePutToCache(ro, block_info); + } + return block_res; +} + +InternalIterator* BlockBasedTable::NewBlockIterator( + BlockBasedTable::CachableEntry* block, BlockType block_type, BlockIter* input_iter) { + InternalIterator* iter = block->value->NewIterator( + rep_->comparator.get(), GetKeyValueEncodingFormat(block_type), input_iter); + if (block->cache_handle) { + Cache* block_cache = rep_->table_options.block_cache.get(); + iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, block->cache_handle); + } else { + iter->RegisterCleanup(&DeleteHeldResource, block->value, nullptr); + } + return iter; } -InternalIterator* BlockBasedTable::NewDataBlockIterator(const ReadOptions& ro, - const Slice& index_value, BlockType block_type, BlockIter* input_iter) { +InternalIterator* BlockBasedTable::NewBlockIterator( + const ReadOptions& ro, const Slice index_value, BlockType block_type, BlockIter* input_iter) { PERF_TIMER_GUARD(new_table_block_iter_nanos); auto block = RetrieveBlock(ro, index_value, block_type); - if (block) { - InternalIterator* iter = block->value->NewIterator( - rep_->comparator.get(), GetKeyValueEncodingFormat(block_type), input_iter); - if (block->cache_handle) { - Cache* block_cache = rep_->table_options.block_cache.get(); - iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, block->cache_handle); - } else { - iter->RegisterCleanup(&DeleteHeldResource, block->value, nullptr); - } - return iter; + if (!block.ok()) { + return ReturnErrorIterator(block.status(), input_iter); } - // Failure happened, return corresponding iterator with an error. 
- if (!input_iter) { - return NewErrorInternalIterator(block.status()); - } else { - input_iter->SetStatus(block.status()); - return input_iter; - } + return NewBlockIterator(block.get_ptr(), block_type, input_iter); } // This will be broken if the user specifies an unusual implementation @@ -1486,7 +1637,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& intern } BlockIter biter; - NewDataBlockIterator(read_options, iiter.value(), BlockType::kData, &biter); + NewBlockIterator(read_options, iiter.value(), BlockType::kData, &biter); if (read_options.read_tier == kBlockCacheTier && biter.status().IsIncomplete()) { @@ -1556,7 +1707,7 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, // Load the block specified by the block_handle into the block cache BlockIter biter; - NewDataBlockIterator(ReadOptions::kDefault, block_handle, BlockType::kData, &biter); + NewBlockIterator(ReadOptions::kDefault, block_handle, BlockType::kData, &biter); if (!biter.status().ok()) { // there was an unexpected error while pre-fetching @@ -1891,8 +2042,7 @@ Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) { std::unique_ptr datablock_iter; datablock_iter.reset( - NewDataBlockIterator( - ReadOptions::kDefault, blockhandles_iter->value(), BlockType::kData)); + NewBlockIterator(ReadOptions::kDefault, blockhandles_iter->value(), BlockType::kData)); s = datablock_iter->status(); if (!s.ok()) { @@ -1981,9 +2131,18 @@ yb::Result BlockBasedTable::GetMiddleKey() { } auto data_block = VERIFY_RESULT( - RetrieveBlockFromFile(ReadOptions::kDefault, index_iter->value(), BlockType::kData)); - return data_block->GetMiddleKey(GetKeyValueEncodingFormat(BlockType::kData), - rep_->comparator.get(), MiddlePointPolicy::kMiddleHigh); + RetrieveBlock(ReadOptions::kDefault, index_iter->value(), BlockType::kData)); + auto middle_key_res = data_block.value->GetMiddleKey( + GetKeyValueEncodingFormat(BlockType::kData), rep_->comparator.get(), + MiddlePointPolicy::kMiddleHigh); + // TODO: consider porting https://github.com/facebook/rocksdb/pull/5252 and then + // https://github.com/facebook/rocksdb/pull/12694 + if (data_block.cache_handle) { + data_block.Release(rep_->table_options.block_cache.get()); + } else { + delete data_block.value; + } + return middle_key_res; } yb::Result BlockBasedTable::TEST_GetIndexReader() { @@ -2001,4 +2160,16 @@ yb::Result BlockBasedTable::TEST_GetIndexReader() { return index_reader_ptr; } +yb::Result BlockBasedTable::TEST_GetBlockNumRestarts( + const ReadOptions& ro, const Slice index_value, BlockType block_type) { + auto data_block = VERIFY_RESULT(RetrieveBlock(ro, index_value, block_type)); + auto num_restarts = data_block.value->NumRestarts(); + if (data_block.cache_handle) { + data_block.Release(rep_->table_options.block_cache.get()); + } else { + delete data_block.value; + } + return num_restarts; +} + } // namespace rocksdb diff --git a/src/yb/rocksdb/table/block_based_table_reader.h b/src/yb/rocksdb/table/block_based_table_reader.h index af66bbf2b787..8921412077b6 100644 --- a/src/yb/rocksdb/table/block_based_table_reader.h +++ b/src/yb/rocksdb/table/block_based_table_reader.h @@ -192,20 +192,12 @@ class BlockBasedTable : public TableReader { InternalIterator* NewIndexIterator(const ReadOptions& read_options) override; - // Converts an index entry (i.e. an encoded BlockHandle) into an iterator over the contents of - // a correspoding block. Updates and returns input_iter if the one is specified, or returns - // a new iterator. 
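The reworked GetMiddleKey() and the new TEST_GetBlockNumRestarts() above both follow the cleanup contract that RetrieveBlock() now places on its callers: if the returned entry carries a cache handle, the handle is released back to the block cache; otherwise the caller owns the heap-allocated block and must delete it. A minimal self-contained sketch of that contract, using simplified stand-in types rather than the real RocksDB classes:

// Simplified stand-ins for the RocksDB types involved (not the real definitions).
struct Block { int num_restarts = 0; };
struct Cache {
  struct Handle {};
  void Release(Handle* handle) { delete handle; }  // drop the pin held on the cached block
};
struct CachableEntry {
  Block* value = nullptr;                  // always set on success
  Cache::Handle* cache_handle = nullptr;   // set only when the block came from the block cache
};

// Mirrors the pattern used by GetMiddleKey() and TEST_GetBlockNumRestarts():
// consume the block, then release or delete it depending on where it came from.
int ConsumeNumRestarts(CachableEntry entry, Cache* block_cache) {
  const int num_restarts = entry.value->num_restarts;
  if (entry.cache_handle) {
    block_cache->Release(entry.cache_handle);  // block stays owned by the cache
  } else {
    delete entry.value;                        // block was read straight from the file
  }
  return num_restarts;
}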
- InternalIterator* NewDataBlockIterator( - const ReadOptions& ro, const Slice& index_value, BlockType block_type, - BlockIter* input_iter = nullptr); - const ImmutableCFOptions& ioptions(); yb::Result GetMiddleKey() override; - // Helper function that force reading block from a file and takes care about block cleanup. - yb::Result> RetrieveBlockFromFile(const ReadOptions& ro, - const Slice& index_value, BlockType block_type); + yb::Result TEST_GetBlockNumRestarts( + const ReadOptions& ro, const Slice index_value, BlockType block_type); ~BlockBasedTable(); @@ -217,6 +209,8 @@ class BlockBasedTable : public TableReader { yb::Result TEST_GetIndexReader(); private: + struct BlockRetrievalInfo; + template struct CachableEntry; @@ -300,6 +294,15 @@ class BlockBasedTable : public TableReader { std::unique_ptr* index_reader, InternalIterator* preloaded_meta_index_iter = nullptr); + // Converts an index entry (i.e. an encoded BlockHandle) into an iterator over the contents of + // a corresponding block (data block or lower level index block). Updates and returns input_iter + // if the one is specified, or returns a new iterator. + InternalIterator* NewBlockIterator( + const ReadOptions& ro, const Slice index_value, BlockType block_type, + BlockIter* input_iter = nullptr); + InternalIterator* NewBlockIterator( + CachableEntry* block, BlockType block_type, BlockIter* input_iter); + bool NonBlockBasedFilterKeyMayMatch(FilterBlockReader* filter, const Slice& filter_key) const; Status ReadPropertiesBlock(InternalIterator* meta_iter); @@ -324,10 +327,17 @@ class BlockBasedTable : public TableReader { FileReaderWithCachePrefix* GetBlockReader(BlockType block_type) const; KeyValueEncodingFormat GetKeyValueEncodingFormat(BlockType block_type) const; + Status GetBlockRetrievalInfo( + Slice index_value, const BlockType block_type, BlockRetrievalInfo *info); + // Retrieves block from file system or cache. // NOTE! A caller is responsible for a block cleanup. 
- yb::Result> RetrieveBlock(const ReadOptions& ro, const Slice& index_value, - BlockType block_type, bool use_cache = true); + yb::Result> RetrieveBlock( + const ReadOptions& ro, const Slice index_value, BlockType block_type); + yb::Result> GetBlockFromCache( + const ReadOptions& ro, const BlockRetrievalInfo& info); + yb::Result> ReadBlockFromFileAndMaybePutToCache( + const ReadOptions& ro, const BlockRetrievalInfo& info); explicit BlockBasedTable(Rep* rep) : rep_(rep) {} diff --git a/src/yb/rocksdb/table/index_reader.cc b/src/yb/rocksdb/table/index_reader.cc index 566dce8b2a32..6c0860cb6ea7 100644 --- a/src/yb/rocksdb/table/index_reader.cc +++ b/src/yb/rocksdb/table/index_reader.cc @@ -138,9 +138,9 @@ class MultiLevelIterator final : public InternalIterator { static constexpr auto kIterChainInitialCapacity = 4; MultiLevelIterator( - TwoLevelIteratorState* state, InternalIterator* top_level_iter, uint32_t num_levels, - bool need_free_top_level_iter) - : state_(state), iter_(num_levels), index_block_handle_(num_levels - 1), + std::unique_ptr state, InternalIterator* top_level_iter, + uint32_t num_levels, bool need_free_top_level_iter) + : state_(std::move(state)), iter_(num_levels), index_block_handle_(num_levels - 1), bottom_level_iter_(iter_.data() + (num_levels - 1)), need_free_top_level_iter_(need_free_top_level_iter) { iter_[0].Set(top_level_iter); @@ -274,7 +274,7 @@ class MultiLevelIterator final : public InternalIterator { } } - TwoLevelIteratorState* const state_; + std::unique_ptr const state_; boost::container::small_vector iter_; // If iter_[level] holds non-nullptr, then "index_block_handle_[level-1]" holds the // handle passed to state_->NewSecondaryIterator to create iter_[level]. @@ -298,11 +298,11 @@ Result> MultiLevelIndexReader::Create( } InternalIterator* MultiLevelIndexReader::NewIterator( - BlockIter* iter, TwoLevelIteratorState* index_iterator_state, bool) { + BlockIter* iter, std::unique_ptr index_iterator_state, bool) { InternalIterator* top_level_iter = top_level_index_block_->NewIndexIterator( comparator_.get(), iter, /* total_order_seek = */ true); return new MultiLevelIterator( - index_iterator_state, top_level_iter, num_levels_, top_level_iter != iter); + std::move(index_iterator_state), top_level_iter, num_levels_, top_level_iter != iter); } Result MultiLevelIndexReader::GetMiddleKey() const { diff --git a/src/yb/rocksdb/table/index_reader.h b/src/yb/rocksdb/table/index_reader.h index b128ab49a599..4a29e949d33b 100644 --- a/src/yb/rocksdb/table/index_reader.h +++ b/src/yb/rocksdb/table/index_reader.h @@ -52,9 +52,10 @@ class IndexReader { // - top level index block iterator is passed and updated instead of the whole index iterator, // but return semantic is the same - the whole index iterator is returned. // - index_iterator_state is used to create secondary iterators on index. - virtual InternalIterator* NewIterator(BlockIter* iter = nullptr, - TwoLevelIteratorState* index_iterator_state = nullptr, - bool total_order_seek = true) = 0; + virtual InternalIterator* NewIterator( + BlockIter* iter = nullptr, + std::unique_ptr index_iterator_state = nullptr, + bool total_order_seek = true) = 0; // Returns approximate middle key from the index. 
Key from the index might not match any key // actually written to SST file, because keys could be shortened and substituted before them are @@ -91,7 +92,8 @@ class BinarySearchIndexReader : public IndexReader { InternalIterator* NewIterator( BlockIter* iter = nullptr, // Rest of parameters are ignored by BinarySearchIndexReader. - TwoLevelIteratorState* state = nullptr, bool total_order_seek = true) override { + std::unique_ptr state = nullptr, + bool total_order_seek = true) override { auto new_iter = index_block_->NewIndexIterator(comparator_.get(), iter, true); return iter ? nullptr : new_iter; } @@ -136,7 +138,7 @@ class HashIndexReader : public IndexReader { bool hash_index_allow_collision, const std::shared_ptr& mem_tracker); InternalIterator* NewIterator( - BlockIter* iter = nullptr, TwoLevelIteratorState* state = nullptr, + BlockIter* iter = nullptr, std::unique_ptr state = nullptr, bool total_order_seek = true) override { auto new_iter = index_block_->NewIndexIterator(comparator_.get(), iter, total_order_seek); return iter ? nullptr : new_iter; @@ -196,7 +198,7 @@ class MultiLevelIndexReader : public IndexReader { ~MultiLevelIndexReader() {} InternalIterator* NewIterator( - BlockIter* iter, TwoLevelIteratorState* index_iterator_state, bool) override; + BlockIter* iter, std::unique_ptr index_iterator_state, bool) override; Result GetMiddleKey() const override; diff --git a/src/yb/rocksdb/util/file_reader_writer.h b/src/yb/rocksdb/util/file_reader_writer.h index 14c1bcf1c07c..fa1ee55b1435 100644 --- a/src/yb/rocksdb/util/file_reader_writer.h +++ b/src/yb/rocksdb/util/file_reader_writer.h @@ -121,6 +121,8 @@ class RandomAccessFileReader { uint64_t offset, size_t n, Slice* result, char* scratch, const yb::ReadValidator& validator, Statistics* statistics = nullptr); + void Readahead(size_t offset, size_t length) { file_->Readahead(offset, length); } + RandomAccessFile* file() { return file_.get(); } }; diff --git a/src/yb/rocksdb/util/statistics.cc b/src/yb/rocksdb/util/statistics.cc index c3a7f847365b..3d4525dd3ab9 100644 --- a/src/yb/rocksdb/util/statistics.cc +++ b/src/yb/rocksdb/util/statistics.cc @@ -133,6 +133,10 @@ constexpr std::pair TickersNameMap[] = { {COMPACTION_FILES_FILTERED, "rocksdb_compaction_files_filtered"}, {COMPACTION_FILES_NOT_FILTERED, "rocksdb_compaction_files_not_filtered"}, + + {READAHEAD_RESET, "rocksdb_readahead_reset"}, + {READAHEAD_CALLS, "rocksdb_readahead_calls"}, + {READAHEAD_BYTES_READ, "rocksdb_readahead_bytes_read"}, }; constexpr std::pair HistogramsNameMap[] = { @@ -142,6 +146,7 @@ constexpr std::pair HistogramsNameMap[] = { {WAL_FILE_SYNC_MICROS, "rocksdb_wal_file_sync_micros"}, {DB_MULTIGET, "rocksdb_db_multiget_micros"}, {READ_BLOCK_COMPACTION_MICROS, "rocksdb_read_block_compaction_micros"}, + // Only takes into account block reads by RocksDB iterator and Get? {READ_BLOCK_GET_MICROS, "rocksdb_read_block_get_micros"}, {WRITE_RAW_BLOCK_MICROS, "rocksdb_write_raw_block_micros"}, {NUM_FILES_IN_SINGLE_COMPACTION, "rocksdb_numfiles_in_singlecompaction"}, diff --git a/src/yb/util/endian_util.h b/src/yb/util/endian_util.h new file mode 100644 index 000000000000..b4868de1a77b --- /dev/null +++ b/src/yb/util/endian_util.h @@ -0,0 +1,52 @@ +// Copyright (c) YugabyteDB, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. See the License for the specific language governing permissions and limitations +// under the License. +// + +#pragma once + +#include "yb/gutil/endian.h" + +#include "yb/util/slice.h" +#include "yb/util/status.h" + +namespace yb { + +// Write value encoded with specified endian and update output address. +template +std::enable_if_t Write(SingleByteType*& p, T v) { + Store(p, v); + p += sizeof(T); +} + +// Read value encoded with specified endian and update input address. +template +std::enable_if_t Read(SingleByteType*& p) { + auto ptr = p; + p += sizeof(T); + return Load(ptr); +} + +// Read value encoded with specified endian from slice and remove read prefix. +// Return failure if value cannot be read from the slice. +template +Result CheckedRead(Slice& slice) { + if (slice.size() < sizeof(T)) { + return STATUS_FORMAT( + Corruption, "Not enough bytes to read: $0, need $1", slice.size(), sizeof(T)); + } + + auto ptr = slice.data(); + slice.RemovePrefix(sizeof(T)); + return Load(ptr); +} + +} // namespace yb diff --git a/src/yb/util/file_system.h b/src/yb/util/file_system.h index 7b664ce7f668..176056743dd9 100644 --- a/src/yb/util/file_system.h +++ b/src/yb/util/file_system.h @@ -184,6 +184,8 @@ class RandomAccessFile : public FileWithUniqueId { // of this file. If the length is 0, then it refers to the end of file. // If the system is not caching the file contents, then this is a noop. virtual Status InvalidateCache(size_t offset, size_t length); + + virtual void Readahead(size_t offset, size_t length) {} }; } // namespace yb diff --git a/src/yb/util/file_system_posix.cc b/src/yb/util/file_system_posix.cc index 55c63d4b811f..5205f8f22881 100644 --- a/src/yb/util/file_system_posix.cc +++ b/src/yb/util/file_system_posix.cc @@ -278,6 +278,17 @@ Status PosixRandomAccessFile::InvalidateCache(size_t offset, size_t length) { #endif } +void PosixRandomAccessFile::Readahead(size_t offset, size_t length) { +#ifdef __linux__ + auto ret = readahead(fd_, implicit_cast(offset), length); + if (ret == 0) { + return; + } + YB_LOG_EVERY_N_SECS(ERROR, 60) << "Readahead error for " << filename_ << " at " << offset + << ", length=" << length << ": " << ErrnoToString(errno); +#endif +} + } // namespace yb namespace rocksdb { diff --git a/src/yb/util/file_system_posix.h b/src/yb/util/file_system_posix.h index 4243068419d2..4bd23a7fb8a2 100644 --- a/src/yb/util/file_system_posix.h +++ b/src/yb/util/file_system_posix.h @@ -60,8 +60,10 @@ class PosixRandomAccessFile : public RandomAccessFile { #ifdef __linux__ virtual size_t GetUniqueId(char* id) const override; #endif - virtual void Hint(AccessPattern pattern) override; - virtual Status InvalidateCache(size_t offset, size_t length) override; + void Hint(AccessPattern pattern) override; + Status InvalidateCache(size_t offset, size_t length) override; + + void Readahead(size_t offset, size_t length) override; private: std::string filename_; diff --git a/src/yb/util/range.h b/src/yb/util/range.h index 4f3ed8c44c67..bfd94e830101 100644 --- a/src/yb/util/range.h +++ b/src/yb/util/range.h @@ -108,6 +108,23 @@ class RangeIterator : public std::iterator Int step_; }; +template +class RangeObject; + +template +class RangeObjectToContainerHelper { + public: + 
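The new CheckedRead<T, Endian>() helper above reads a fixed-width integer off the front of a Slice, returns a Corruption status when fewer than sizeof(T) bytes remain, and strips the consumed prefix so the caller can keep parsing from where it left off. A rough, self-contained illustration of that read-and-advance pattern, using std::optional and std::string_view in place of the yb::Result and Slice machinery:

#include <cstddef>
#include <cstdint>
#include <optional>
#include <string_view>

// Simplified analogue of CheckedRead<uint32_t, BigEndian>(Slice&): decode a big-endian
// uint32 from the front of the buffer and remove the consumed bytes from the view.
std::optional<uint32_t> CheckedReadU32BE(std::string_view& data) {
  if (data.size() < sizeof(uint32_t)) {
    return std::nullopt;  // the real helper returns STATUS_FORMAT(Corruption, ...)
  }
  uint32_t value = 0;
  for (size_t i = 0; i < sizeof(uint32_t); ++i) {
    value = (value << 8) | static_cast<uint8_t>(data[i]);
  }
  data.remove_prefix(sizeof(uint32_t));  // mirrors slice.RemovePrefix(sizeof(T))
  return value;
}

Parsing a length-prefixed payload then becomes: read the length with one call, then take that many bytes from what is left of the view.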
explicit RangeObjectToContainerHelper(const RangeObject& range) : range_(range) {} + + template + operator Out() const { + return Out(range_.begin(), range_.end()); + } + + private: + const RangeObject& range_; +}; + template class RangeObject { public: @@ -142,6 +159,10 @@ class RangeObject { return {new_start, new_stop, new_step}; } + RangeObjectToContainerHelper ToContainer() const { + return RangeObjectToContainerHelper(*this); + } + private: [[nodiscard]] static Int NormalizedStop(Int start, Int stop, Int step) { auto diff = stop - start; diff --git a/src/yb/util/slice.cc b/src/yb/util/slice.cc index 5994717832ab..725cc1e2e974 100644 --- a/src/yb/util/slice.cc +++ b/src/yb/util/slice.cc @@ -145,7 +145,7 @@ uint8_t Slice::operator[](size_t n) const { return begin_[n]; } -void Slice::remove_prefix(size_t n) { +void Slice::RemovePrefix(size_t n) { DCHECK_LE(n, size()); begin_ += n; } @@ -164,7 +164,7 @@ Slice Slice::WithoutPrefix(size_t n) const { return Slice(begin_ + n, end_); } -void Slice::remove_suffix(size_t n) { +void Slice::RemoveSuffix(size_t n) { DCHECK_LE(n, size()); end_ -= n; } diff --git a/src/yb/util/slice.h b/src/yb/util/slice.h index d3c845115603..e8166b6d8fc0 100644 --- a/src/yb/util/slice.h +++ b/src/yb/util/slice.h @@ -131,13 +131,21 @@ class Slice { uint8_t operator[](size_t n) const; // Change this slice to refer to an empty array - void clear() { + void Clear() { begin_ = to_uchar_ptr(""); end_ = begin_; } + [[deprecated]] void clear() { + Clear(); + } + // Drop the first "n" bytes from this slice. - void remove_prefix(size_t n); + void RemovePrefix(size_t n); + + [[deprecated]] void remove_prefix(size_t n) { + RemovePrefix(n); + } Slice Prefix(size_t n) const; @@ -147,7 +155,11 @@ class Slice { Slice WithoutPrefix(size_t n) const; // Drop the last "n" bytes from this slice. - void remove_suffix(size_t n); + void RemoveSuffix(size_t n); + + [[deprecated]] void remove_suffix(size_t n) { + RemoveSuffix(n); + } Slice Suffix(size_t n) const; diff --git a/src/yb/yql/pgwrapper/pg_ddl_atomicity_stress-test.cc b/src/yb/yql/pgwrapper/pg_ddl_atomicity_stress-test.cc index edabeda9a5fc..891efe89e851 100644 --- a/src/yb/yql/pgwrapper/pg_ddl_atomicity_stress-test.cc +++ b/src/yb/yql/pgwrapper/pg_ddl_atomicity_stress-test.cc @@ -48,7 +48,13 @@ const auto kTable = "test_table"; namespace yb { namespace pgwrapper { -class PgDdlAtomicityStressTest : public PgDdlAtomicityTestBase { +YB_STRONGLY_TYPED_BOOL(ColocatedDatabase); +YB_STRONGLY_TYPED_BOOL(PartitionedTables); + +class PgDdlAtomicityStressTest + : public PgDdlAtomicityTestBase, + public ::testing::WithParamInterface> { protected: void UpdateMiniClusterOptions(ExternalMiniClusterOptions* options) override { options->extra_tserver_flags.push_back("--yb_enable_read_committed_isolation=false"); @@ -57,24 +63,34 @@ class PgDdlAtomicityStressTest : public PgDdlAtomicityTestBase { options->extra_master_flags.push_back("--ysql_ddl_transaction_wait_for_ddl_verification=false"); } - Status StressTestWithFlag(const std::string& error_probability); + Status SetupTables(); + + Result Connect(); + + int NumIterations(); - virtual Status SetupTables(); + std::string database(); - virtual Result Connect() { - return LibPqTestBase::Connect(); + // Return a cached global connection that is used for test table/data setup, and test + // result verification. 
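The ToContainer() addition to RangeObject above hands back a small helper whose templated conversion operator constructs any container that accepts a begin/end iterator pair, so the destination type is chosen by the variable being initialized. A standalone version of the same idiom, independent of the Range machinery:

#include <set>
#include <vector>

template <class Iterator>
class ToContainerHelper {
 public:
  ToContainerHelper(Iterator begin, Iterator end) : begin_(begin), end_(end) {}

  // The target container type is deduced from the left-hand side of the initialization.
  template <class Out>
  operator Out() const { return Out(begin_, end_); }

 private:
  Iterator begin_;
  Iterator end_;
};

int main() {
  const std::vector<int> source = {3, 1, 2, 3};
  ToContainerHelper<std::vector<int>::const_iterator> helper(source.begin(), source.end());
  std::vector<int> as_vector = helper;  // keeps duplicates: {3, 1, 2, 3}
  std::set<int> as_set = helper;        // same helper, different container: {1, 2, 3}
  return as_vector.size() == 4 && as_set.size() == 3 ? 0 : 1;
}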
+ PGConn* GetGlobalConn() { + if (!global_conn) { + global_conn = std::make_unique(CHECK_RESULT(Connect())); + } + return global_conn.get(); } - virtual int NumIterations() { - return RegularBuildVsSanitizers(10, 5); + bool IsColocated() const { + return std::get<0>(GetParam()); } - virtual std::string database() { - return "yugabyte"; + bool IsPartitioned() const { + return std::get<1>(GetParam()); } - private: - Status StressTest(); + std::string ErrorProbability() const { + return std::get<2>(GetParam()); + } Status TestDdl(const std::vector& ddl, const int iteration); @@ -83,136 +99,75 @@ class PgDdlAtomicityStressTest : public PgDdlAtomicityTestBase { Status TestDml(const int num_iterations); template - Result ExecuteFormatWithRetry(const std::string& format, Args&&... args) { - return DoExecuteWithRetry(Format(format, std::forward(args)...)); + Result ExecuteFormatWithRetry(PGConn* conn, const std::string& format, Args&&... args) { + return DoExecuteWithRetry(conn, Format(format, std::forward(args)...)); } - Result DoExecuteWithRetry(const std::string& stmt); + Result DoExecuteWithRetry(PGConn* conn, const std::string& stmt); Status InsertTestData(const int num_rows); + + private: + std::unique_ptr global_conn; }; Status PgDdlAtomicityStressTest::SetupTables() { - auto conn = VERIFY_RESULT(Connect()); - return conn.Execute(CreateTableStmt(kTable)); + if (IsColocated()) { + // We need a separate connection to create the colocated database, before we can + // connect to it via GetGlobalConn(). + auto conn_init = VERIFY_RESULT(LibPqTestBase::Connect()); + RETURN_NOT_OK(conn_init.ExecuteFormat("CREATE DATABASE $0 WITH colocated = true", database())); + } + auto global_conn = GetGlobalConn(); + if (IsPartitioned()) { + RETURN_NOT_OK(global_conn->ExecuteFormat( + "CREATE TABLE $0 (key INT PRIMARY KEY, value TEXT, num real) " + " PARTITION BY RANGE(key)", kTable)); + RETURN_NOT_OK(global_conn->ExecuteFormat( + "CREATE TABLE $0_$1 PARTITION OF $0 FOR VALUES FROM ($1) TO ($2)", + kTable, 1, NumIterations())); + // Create a default partition. + return global_conn->ExecuteFormat("CREATE TABLE $0_default PARTITION OF $0 DEFAULT", kTable); + } + return global_conn->Execute(CreateTableStmt(kTable)); } Status PgDdlAtomicityStressTest::InsertTestData(const int num_rows) { - auto conn = VERIFY_RESULT(Connect()); - return conn.ExecuteFormat( + auto global_conn = GetGlobalConn(); + return global_conn->ExecuteFormat( "INSERT INTO $0 VALUES (generate_series(1, $1))", kTable, num_rows); } -Status PgDdlAtomicityStressTest::StressTestWithFlag(const std::string& error_probability) { - RETURN_NOT_OK(SetupTables()); - - RETURN_NOT_OK(InsertTestData(NumIterations() * 2)); - - if (!error_probability.empty()) { - RETURN_NOT_OK(cluster_->SetFlagOnMasters(error_probability, "0.1")); +Result PgDdlAtomicityStressTest::Connect() { + if (IsColocated()) { + return ConnectToDB(database()); } - - RETURN_NOT_OK(StressTest()); - - return Status::OK(); + return LibPqTestBase::Connect(); } -Status PgDdlAtomicityStressTest::StressTest() { - const int num_iterations = NumIterations(); - TestThreadHolder thread_holder; - - // We test creation/deletion together so that we can be sure that the entity we are dropping - // exists when it is executed. - - // Create a thread to add and drop columns. 
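GetGlobalConn() above lazily opens a single connection on first use and keeps reusing it for table creation, data loading and the final verification, while every DDL/DML worker opens its own connection through Connect(). A compact sketch of just the lazy-caching part, with a placeholder connection type standing in for PGConn:

#include <memory>
#include <string>
#include <utility>

struct Conn {  // placeholder for PGConn
  explicit Conn(std::string db) : db_name(std::move(db)) {}
  std::string db_name;
};

class Fixture {
 public:
  // Opened on first call, then shared by setup and verification code paths.
  Conn* GetGlobalConn() {
    if (!global_conn_) {
      global_conn_ = std::make_unique<Conn>("yugabyte");
    }
    return global_conn_.get();
  }

 private:
  std::unique_ptr<Conn> global_conn_;
};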
- thread_holder.AddThreadFunctor([this, num_iterations] { - std::vector ddls = { - "ALTER TABLE $0 ADD COLUMN col_$1 TEXT", - "ALTER TABLE $0 DROP COLUMN col_$1" - }; - ASSERT_OK(TestDdl(ddls, num_iterations)); - LOG(INFO) << "Thread to add and drop columns has completed"; - }); - - // Create a thread to add and drop columns with default values. - thread_holder.AddThreadFunctor([this, num_iterations] { - std::vector ddls = { - "ALTER TABLE $0 ADD COLUMN col_def_$1 TEXT DEFAULT 'def'", - "ALTER TABLE $0 DROP COLUMN col_def_$1" - }; - ASSERT_OK(TestDdl(ddls, num_iterations)); - LOG(INFO) << "Thread to add and drop columns with default values has completed"; - }); - - // Create a thread to create/drop an index on this table. - thread_holder.AddThreadFunctor([this, num_iterations] { - std::vector ddls = { - "CREATE INDEX NONCONCURRENTLY non_concurrent_idx_$1 ON $0(key)", - "DROP INDEX non_concurrent_idx_$1" - }; - ASSERT_OK(TestDdl(ddls, num_iterations)); - LOG(INFO) << "Thread to create/drop an index has completed"; - }); - - // ConcurrentIndex is a very long running operation. Cleaning up a failed ConcurrentIndex is - // also a DDL, and this can be a very long running test. Reduce the number of iterations. - thread_holder.AddThreadFunctor([this, num_iterations] { - ASSERT_OK(TestConcurrentIndex(num_iterations / 2)); - LOG(INFO) << "Thread to run concurrent index has completed"; - }); - - // Create a thread to update the rows on this table. - thread_holder.AddThreadFunctor([this, num_iterations] { - ASSERT_OK(TestDml(num_iterations)); - LOG(INFO) << "Thread to update the rows has completed"; - }); - - // Wait for all threads to complete. - thread_holder.JoinAll(); - - LOG(INFO) << "Verify that the table does not contain any additional columns"; - auto client = VERIFY_RESULT(cluster_->CreateClient()); - RETURN_NOT_OK(VerifySchema(client.get(), database(), kTable, {"key", "value", "num"})); - - LOG(INFO) << "Verify that no indexes are present on this table"; - for (int i = 0; i < num_iterations; ++i) { - client::VerifyTableNotExists(client.get(), database(), Format("idx_$0", i), 10); - } - - LOG(INFO) << "Verify that all the rows on this table are updated correctly"; - auto conn = VERIFY_RESULT(Connect()); - for (int i = 1; i <= num_iterations; ++i) { - auto res = VERIFY_RESULT(conn.FetchFormat("SELECT value FROM $0 WHERE key = $1", kTable, i)); - auto num_rows = PQntuples(res.get()); - if (num_rows != 1) { - return STATUS_FORMAT(Corruption, "Expected 1 rows for key $0, found $1", i, num_rows); - } - - if (int num_cols = PQnfields(res.get()) != 1) { - return STATUS_FORMAT(Corruption, "got unexpected number of columns: $0", num_cols); - } - - auto expected_val = Format("value_$0", i); - std::string val = VERIFY_RESULT(GetValue(res.get(), 0, 0)); - if (val != expected_val) { - return STATUS_FORMAT(Corruption, "Expected to get $0 for key $1 but got $2", - expected_val, i, val); - } +int PgDdlAtomicityStressTest::NumIterations() { + if (IsPartitioned()) { + // Fewer iterations are sufficient for partitioned table tests because each DDL statement + // internally invokes (num_partitions + 1) DDLs. + return 3; } - LOG(INFO) << __FUNCTION__ << " done"; - return Status::OK(); + return RegularBuildVsSanitizers(10, 5); } +std::string PgDdlAtomicityStressTest::database() { + return IsColocated() ? 
"yugabyte_colocated" : "yugabyte"; +} Status PgDdlAtomicityStressTest::TestDdl( const std::vector& ddls, const int num_iterations) { + auto conn = VERIFY_RESULT(Connect()); for (int i = 0; i < num_iterations; ++i) { for (const auto& ddl : ddls) { auto stmt = Format(ddl, kTable, i); LOG(INFO) << "Executing stmt " << stmt; - while (!VERIFY_RESULT(DoExecuteWithRetry(stmt))) { + while (!VERIFY_RESULT(DoExecuteWithRetry(&conn, stmt))) { LOG(INFO) << "Retry executing stmt " << stmt; } } @@ -220,9 +175,8 @@ Status PgDdlAtomicityStressTest::TestDdl( return Status::OK(); } -Result PgDdlAtomicityStressTest::DoExecuteWithRetry(const std::string& stmt) { - auto conn = VERIFY_RESULT(Connect()); - auto s = conn.Execute(stmt); +Result PgDdlAtomicityStressTest::DoExecuteWithRetry(PGConn* conn, const std::string& stmt) { + auto s = conn->Execute(stmt); if (s.ok()) { LOG(INFO) << "Execution of stmt " << stmt << " succeeded"; return true; @@ -240,6 +194,7 @@ Result PgDdlAtomicityStressTest::DoExecuteWithRetry(const std::string& stm "Flush: Value write after transaction start"sv, "Injected random failure for testing"sv, "expired or aborted by a conflict"sv, + "current transaction is expired or aborted"sv, "schema version mismatch for table"sv, "marked for deletion in table"sv, "Invalid column number"sv, @@ -270,22 +225,23 @@ Result PgDdlAtomicityStressTest::DoExecuteWithRetry(const std::string& stm } Status PgDdlAtomicityStressTest::TestConcurrentIndex(const int num_iterations) { + auto conn = VERIFY_RESULT(Connect()); for (int i = 0; i < num_iterations; ++i) { bool index_created = false; while (!index_created) { // If concurrent index creation fails, it does not clean up the invalid index. Thus to // make the statement idempotent, drop the index if the create index failed before retrying. 
index_created = VERIFY_RESULT(ExecuteFormatWithRetry( - "CREATE INDEX idx_$0 ON $1(key)", i, kTable)); + &conn, "CREATE INDEX idx_$0 ON $1(key)", i, kTable)); if (!index_created) { auto stmt = Format("DROP INDEX IF EXISTS idx_$0", i); - while (!VERIFY_RESULT(ExecuteFormatWithRetry(stmt))) { + while (!VERIFY_RESULT(ExecuteFormatWithRetry(&conn, stmt))) { LOG(INFO) << "Retry executing stmt " << stmt; } } } auto stmt = Format("DROP INDEX idx_$0", i); - while (!VERIFY_RESULT(ExecuteFormatWithRetry(stmt))) { + while (!VERIFY_RESULT(ExecuteFormatWithRetry(&conn, stmt))) { LOG(INFO) << "Retry executing stmt " << stmt; } } @@ -296,119 +252,112 @@ Status PgDdlAtomicityStressTest::TestDml(const int num_iterations) { auto conn = VERIFY_RESULT(Connect()); for (int i = 1; i <= num_iterations;) { if (VERIFY_RESULT(ExecuteFormatWithRetry( - "UPDATE $0 SET value = 'value_$1' WHERE key = $1", kTable, i))) { + &conn, "UPDATE $0 SET value = 'value_$1' WHERE key = $1", kTable, i))) { ++i; } } return Status::OK(); } -TEST_F(PgDdlAtomicityStressTest, BasicTest) { - ASSERT_OK(StressTestWithFlag("")); -} - -TEST_F(PgDdlAtomicityStressTest, TestTxnVerificationFailure) { - ASSERT_OK(StressTestWithFlag("TEST_ysql_ddl_transaction_verification_failure_probability")); -} -TEST_F(PgDdlAtomicityStressTest, TestFailCatalogWrites) { - ASSERT_OK(StressTestWithFlag( - "TEST_ysql_fail_probability_of_catalog_writes_by_ddl_verification")); -} +INSTANTIATE_TEST_CASE_P( + PgDdlAtomicityStressTest, + PgDdlAtomicityStressTest, + ::testing::Combine( + ::testing::Values(ColocatedDatabase::kFalse, ColocatedDatabase::kTrue), + ::testing::Values(PartitionedTables::kFalse, PartitionedTables::kTrue), + ::testing::Values("", + "TEST_ysql_ddl_transaction_verification_failure_probability", + "TEST_ysql_fail_probability_of_catalog_writes_by_ddl_verification", + "TEST_ysql_ddl_rollback_failure_probability", + "TEST_ysql_ddl_verification_failure_probability"))); -TEST_F(PgDdlAtomicityStressTest, TestFailDdlRollback) { - ASSERT_OK(StressTestWithFlag("TEST_ysql_ddl_rollback_failure_probability")); -} +TEST_P(PgDdlAtomicityStressTest, StressTest) { + ASSERT_OK(SetupTables()); -TEST_F(PgDdlAtomicityStressTest, TestFailDdlVerification) { - ASSERT_OK(StressTestWithFlag("TEST_ysql_ddl_verification_failure_probability")); -} + ASSERT_OK(InsertTestData(NumIterations() * 2)); -/* - * Tests on Colocated Tables. -*/ -class PgDdlAtomicityColocatedStressTest : public PgDdlAtomicityStressTest { - virtual Status SetupTables() override; - - virtual std::string database() override { - return "yugabyte_colocated"; - } - - Result Connect() override { - return ConnectToDB(database()); + if (!ErrorProbability().empty()) { + ASSERT_OK(cluster_->SetFlagOnMasters(ErrorProbability(), "0.1")); } -}; -Status PgDdlAtomicityColocatedStressTest::SetupTables() { - auto conn_init = VERIFY_RESULT(LibPqTestBase::Connect()); - RETURN_NOT_OK(conn_init.ExecuteFormat("CREATE DATABASE $0 WITH colocated = true", database())); - return PgDdlAtomicityStressTest::SetupTables(); -} + const int num_iterations = NumIterations(); + TestThreadHolder thread_holder; -TEST_F(PgDdlAtomicityColocatedStressTest, BasicTest) { - ASSERT_OK(StressTestWithFlag("")); -} + // We test creation/deletion together so that we can be sure that the entity we are dropping + // exists when it is executed. Each thread uses its own connection for its entire duration. 
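INSTANTIATE_TEST_CASE_P with ::testing::Combine replaces the former per-scenario TEST_F classes with one parameterized suite: two colocation settings, two partitioning settings and five error-injection values (the empty string meaning no fault injection) yield 2 x 2 x 5 = 20 StressTest instances, each reading its configuration out of the parameter tuple with std::get. A tiny gtest example of the same mechanism, purely illustrative and not part of the patch:

#include <string>
#include <tuple>
#include <gtest/gtest.h>

class CombineDemoTest : public ::testing::TestWithParam<std::tuple<bool, std::string>> {};

TEST_P(CombineDemoTest, ReadsTupleElements) {
  const bool colocated = std::get<0>(GetParam());
  const std::string error_flag = std::get<1>(GetParam());
  // 2 bool values x 3 flag strings -> 6 generated test instances.
  const std::string description =
      std::string(colocated ? "colocated" : "plain") + "/" +
      (error_flag.empty() ? "no_flag" : error_flag);
  EXPECT_FALSE(description.empty());
}

INSTANTIATE_TEST_CASE_P(
    CombineDemo, CombineDemoTest,
    ::testing::Combine(
        ::testing::Bool(),
        ::testing::Values("", "flag_a", "flag_b")));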
-TEST_F(PgDdlAtomicityColocatedStressTest, TestTxnVerificationFailure) { - ASSERT_OK(StressTestWithFlag("TEST_ysql_ddl_transaction_verification_failure_probability")); -} + // Create a thread to add and drop columns. + thread_holder.AddThreadFunctor([this, num_iterations] { + std::vector ddls = { + "ALTER TABLE $0 ADD COLUMN col_$1 TEXT", + "ALTER TABLE $0 DROP COLUMN col_$1" + }; + ASSERT_OK(TestDdl(ddls, num_iterations)); + LOG(INFO) << "Thread to add and drop columns has completed"; + }); -TEST_F(PgDdlAtomicityColocatedStressTest, TestFailCatalogWrites) { - ASSERT_OK(StressTestWithFlag( - "TEST_ysql_fail_probability_of_catalog_writes_by_ddl_verification")); -} + // Create a thread to add and drop columns with default values. + thread_holder.AddThreadFunctor([this, num_iterations] { + std::vector ddls = { + "ALTER TABLE $0 ADD COLUMN col_def_$1 TEXT DEFAULT 'def'", + "ALTER TABLE $0 DROP COLUMN col_def_$1" + }; + ASSERT_OK(TestDdl(ddls, num_iterations)); + LOG(INFO) << "Thread to add and drop columns with default values has completed"; + }); -TEST_F(PgDdlAtomicityColocatedStressTest, TestFailDdlRollback) { - ASSERT_OK(StressTestWithFlag("TEST_ysql_ddl_rollback_failure_probability")); -} + // Create a thread to create/drop an index on this table. + thread_holder.AddThreadFunctor([this, num_iterations] { + std::vector ddls = { + "CREATE INDEX NONCONCURRENTLY non_concurrent_idx_$1 ON $0(key)", + "DROP INDEX non_concurrent_idx_$1" + }; + ASSERT_OK(TestDdl(ddls, num_iterations)); + LOG(INFO) << "Thread to create/drop an index has completed"; + }); -TEST_F(PgDdlAtomicityColocatedStressTest, TestFailDdlVerification) { - ASSERT_OK(StressTestWithFlag("TEST_ysql_ddl_verification_failure_probability")); -} + // ConcurrentIndex is a very long running operation. Cleaning up a failed ConcurrentIndex is + // also a DDL, and this can be a very long running test. Reduce the number of iterations. + thread_holder.AddThreadFunctor([this, num_iterations] { + ASSERT_OK(TestConcurrentIndex(num_iterations / 2)); + LOG(INFO) << "Thread to run concurrent index has completed"; + }); -/* - * Tests on Partitioned Tables. -*/ -class PgDdlAtomicityPartitionedTablesStressTest : public PgDdlAtomicityStressTest { - virtual int NumIterations() override { - // Fewer iterations are sufficient for partitioned table tests because each DDL statement - // internally invokes (num_partitions + 1) DDLs. - return 3; - } - virtual Status SetupTables() override; -}; + // Create a thread to update the rows on this table. + thread_holder.AddThreadFunctor([this, num_iterations] { + ASSERT_OK(TestDml(num_iterations)); + LOG(INFO) << "Thread to update the rows has completed"; + }); -Status PgDdlAtomicityPartitionedTablesStressTest::SetupTables() { - auto conn = VERIFY_RESULT(Connect()); - RETURN_NOT_OK(conn.ExecuteFormat("CREATE TABLE $0 (key INT PRIMARY KEY, value TEXT, num real) " - " PARTITION BY RANGE(key)", kTable)); - RETURN_NOT_OK(conn.ExecuteFormat( - "CREATE TABLE $0_$1 PARTITION OF $0 FOR VALUES FROM ($1) TO ($2)", - kTable, 1, NumIterations())); - // Create a default partition. - RETURN_NOT_OK(conn.ExecuteFormat("CREATE TABLE $0_default PARTITION OF $0 DEFAULT", kTable)); - return Status::OK(); -} + // Wait for all threads to complete. 
+ thread_holder.JoinAll(); -TEST_F(PgDdlAtomicityPartitionedTablesStressTest, BasicTest) { - ASSERT_OK(StressTestWithFlag("")); -} + LOG(INFO) << "Verify that the table does not contain any additional columns"; + auto client = ASSERT_RESULT(cluster_->CreateClient()); + ASSERT_OK(VerifySchema(client.get(), database(), kTable, {"key", "value", "num"})); -TEST_F(PgDdlAtomicityPartitionedTablesStressTest, TestTxnVerificationFailure) { - ASSERT_OK(StressTestWithFlag("TEST_ysql_ddl_transaction_verification_failure_probability")); -} + LOG(INFO) << "Verify that no indexes are present on this table"; + for (int i = 0; i < num_iterations; ++i) { + client::VerifyTableNotExists(client.get(), database(), Format("idx_$0", i), 10); + } -TEST_F(PgDdlAtomicityPartitionedTablesStressTest, TestFailCatalogWrites) { - ASSERT_OK(StressTestWithFlag( - "TEST_ysql_fail_probability_of_catalog_writes_by_ddl_verification")); -} + LOG(INFO) << "Verify that all the rows on this table are updated correctly"; + auto global_conn = GetGlobalConn(); + for (int i = 1; i <= num_iterations; ++i) { + auto res = ASSERT_RESULT(global_conn->FetchFormat( + "SELECT value FROM $0 WHERE key = $1", kTable, i)); + auto num_rows = PQntuples(res.get()); + ASSERT_EQ(num_rows, 1); -TEST_F(PgDdlAtomicityPartitionedTablesStressTest, TestFailDdlRollback) { - ASSERT_OK(StressTestWithFlag("TEST_ysql_ddl_rollback_failure_probability")); -} + auto num_cols = PQnfields(res.get()); + ASSERT_EQ(num_cols, 1); -TEST_F(PgDdlAtomicityPartitionedTablesStressTest, TestFailDdlVerification) { - ASSERT_OK(StressTestWithFlag("TEST_ysql_ddl_verification_failure_probability")); + auto expected_val = Format("value_$0", i); + std::string val = ASSERT_RESULT(GetValue(res.get(), 0, 0)); + ASSERT_EQ(val, expected_val); + } + LOG(INFO) << __FUNCTION__ << " done"; } } // namespace pgwrapper
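The TEST_P body above has a spawn/join/verify shape: each worker thread opens its own connection and loops over DDL or DML, thread_holder.JoinAll() waits for every worker, and only then does the shared global connection check the schema and the row contents. A bare-bones std::thread version of that shape, with placeholder work instead of real statements:

#include <atomic>
#include <thread>
#include <vector>

int main() {
  std::atomic<int> updates_applied{0};
  std::vector<std::thread> workers;

  // Each worker would open its own connection and run its DDL/DML loop.
  for (int worker = 0; worker < 4; ++worker) {
    workers.emplace_back([&updates_applied] {
      for (int i = 0; i < 10; ++i) {
        updates_applied.fetch_add(1, std::memory_order_relaxed);
      }
    });
  }

  // Equivalent of thread_holder.JoinAll(): verification must not start any earlier.
  for (auto& worker : workers) {
    worker.join();
  }

  // Verification runs only after every worker has finished.
  return updates_applied.load() == 40 ? 0 : 1;
}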