From b7250b0f51a7534eb96b28f9182009ee0b5b4b00 Mon Sep 17 00:00:00 2001 From: Georgy Moiseev Date: Thu, 4 Jul 2024 13:16:54 +0300 Subject: [PATCH] metrics: introduce config status gauge The approach used to represent an enum metric here is similar to the one discussed in [1, 2]. It allows supporting a new status later, if required. For example, one can visualize it with a hack like [3]. 1. https://github.com/prometheus/client_python/issues/416 2. https://github.com/open-telemetry/opentelemetry-specification/issues/1711 3. https://stackoverflow.com/a/75761900/11646599 Part of tarantool/grafana-dashboard#224 --- CHANGELOG.md | 1 + doc/monitoring/metrics_reference.rst | 19 +++++ metrics/tarantool/config.lua | 30 ++++++++ test/tarantool/config_metrics_test.lua | 98 ++++++++++++++++++++++---- 4 files changed, 136 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index acf59f63..94ac21f4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - New Tarantool 3 metrics: - tnt_config_alerts + - tnt_config_status ## [1.1.0] - 2024-05-17 ### Added diff --git a/doc/monitoring/metrics_reference.rst b/doc/monitoring/metrics_reference.rst index d5f1672e..0f35c856 100644 --- a/doc/monitoring/metrics_reference.rst +++ b/doc/monitoring/metrics_reference.rst @@ -1011,4 +1011,23 @@ These metrics are available starting from Tarantool 3.0. - Count of current instance :ref:`configuration apply alerts `. ``{level="warn"}`` label covers warnings and ``{level="error"}`` covers errors. + + * - ``tnt_config_status`` + - The status of current instance :ref:`configuration apply `. + The ``status`` label contains a possible status name. + The current status has metric value ``1``; inactive statuses have metric value ``0``. + + .. 
code-block:: none + + # HELP tnt_config_status Tarantool 3 configuration status + # TYPE tnt_config_status gauge + tnt_config_status{status="reload_in_progress",alias="router-001-a"} 0 + tnt_config_status{status="uninitialized",alias="router-001-a"} 0 + tnt_config_status{status="check_warnings",alias="router-001-a"} 0 + tnt_config_status{status="ready",alias="router-001-a"} 1 + tnt_config_status{status="check_errors",alias="router-001-a"} 0 + tnt_config_status{status="startup_in_progress",alias="router-001-a"} 0 + + For example, this set of metrics means that the current configuration + status for ``router-001-a`` is ``ready``. \ No newline at end of file diff --git a/metrics/tarantool/config.lua b/metrics/tarantool/config.lua index 3656e2e9..570a8e1c 100644 --- a/metrics/tarantool/config.lua +++ b/metrics/tarantool/config.lua @@ -16,6 +16,23 @@ local function get_config_alerts(config_info) return config_alerts end +local function get_config_status(config_info) + -- See state diagram here + -- https://github.com/tarantool/doc/issues/3544#issuecomment-1866033480 + local config_status = { + uninitialized = 0, + startup_in_progress = 0, + reload_in_progress = 0, + check_warnings = 0, + check_errors = 0, + ready = 0, + } + + config_status[config_info.status] = 1 + + return config_status +end + local function update() if not utils.is_tarantool3() then return @@ -38,6 +55,19 @@ local function update() {default = true} ) end + + local config_status = get_config_status(config_info) + + for status, value in pairs(config_status) do + collectors_list.config_status = utils.set_gauge( + 'config_status', + 'Tarantool 3 configuration status', + value, + {status = status}, + nil, + {default = true} + ) + end end return { diff --git a/test/tarantool/config_metrics_test.lua b/test/tarantool/config_metrics_test.lua index eec535f7..f3f4f99e 100644 --- a/test/tarantool/config_metrics_test.lua +++ b/test/tarantool/config_metrics_test.lua @@ -123,51 +123,122 @@ local function 
assert_config_alerts_metrics(server, expected_values) t.assert_equals(errors.value, expected_values['error']) end +local function assert_config_status_metrics(server, expected_values) + local observations = server:exec(function() + local metrics = require('metrics') + metrics.invoke_callbacks() + return metrics.collect() + end) -g.before_test('test_config_alerts_if_healthy', start_server) -g.after_test('test_config_alerts_if_healthy', stop_server) + for status, expected_value in pairs(expected_values) do + local actual_obs = utils.find_obs( + 'tnt_config_status', + {status = status, alias = 'server-001-a'}, + observations + ) + t.assert_equals(actual_obs.value, expected_value, + ("got expected value for %q"):format(status)) + end +end -g.test_config_alerts_if_healthy = function(cg) + +g.before_test('test_config_metrics_if_healthy', start_server) +g.after_test('test_config_metrics_if_healthy', stop_server) + +g.test_config_metrics_if_healthy = function(cg) assert_config_alerts_metrics(cg.server, {warn = 0, error = 0}) + assert_config_status_metrics(cg.server, { + uninitialized = 0, + startup_in_progress = 0, + reload_in_progress = 0, + check_warnings = 0, + check_errors = 0, + ready = 1, + }) end -g.before_test('test_config_alerts_if_minor_trouble', start_server) -g.after_test('test_config_alerts_if_minor_trouble', stop_server) +g.before_test('test_config_metrics_if_minor_trouble', start_server) +g.after_test('test_config_metrics_if_minor_trouble', stop_server) -g.test_config_alerts_if_minor_trouble = function(cg) +g.test_config_metrics_if_minor_trouble = function(cg) local config = table.deepcopy(default_config) config['credentials']['users']['user_one'] = {roles = {'role_two'}} reload_config(cg, config) assert_config_alerts_metrics(cg.server, {warn = 1, error = 0}) + assert_config_status_metrics(cg.server, { + uninitialized = 0, + startup_in_progress = 0, + reload_in_progress = 0, + check_warnings = 1, + check_errors = 0, + ready = 0, + }) end 
-g.before_test('test_config_alerts_if_critical_failure', start_server) -g.after_test('test_config_alerts_if_critical_failure', stop_server) +g.before_test('test_config_metrics_if_critical_failure', start_server) +g.after_test('test_config_metrics_if_critical_failure', stop_server) -g.test_config_alerts_if_critical_failure = function(cg) +g.test_config_metrics_if_critical_failure = function(cg) local config = table.deepcopy(default_config) config['groups']['servers'] = {} reload_config(cg, config) assert_config_alerts_metrics(cg.server, {warn = 0, error = 1}) + assert_config_status_metrics(cg.server, { + uninitialized = 0, + startup_in_progress = 0, + reload_in_progress = 0, + check_warnings = 0, + check_errors = 1, + ready = 0, + }) end -g.before_test('test_config_alerts_if_unsupported', function(cg) +g.before_test('test_config_metrics_if_uninitialized', function(cg) + t.skip_if(not utils.is_tarantool_3_config_supported(), + 'Skip since Tarantool 3 config is unsupported') + utils.create_server(cg) + cg.server:exec(function() + -- Config do it by default: + -- https://github.com/tarantool/tarantool/blob/319357d5973d15d08b8eda6a230eada08b710802/src/box/lua/config/applier/box_cfg.lua#L614 + box.cfg{metrics = {labels = {alias = 'server-001-a'}}} + end) +end) + +g.after_test('test_config_metrics_if_uninitialized', function(cg) + utils.drop_server(cg) + cg.server = nil +end) + +g.test_config_metrics_if_uninitialized = function(cg) + assert_config_alerts_metrics(cg.server, {warn = 0, error = 0}) + assert_config_status_metrics(cg.server, { + uninitialized = 1, + startup_in_progress = 0, + reload_in_progress = 0, + check_warnings = 0, + check_errors = 0, + ready = 0, + }) +end + + +g.before_test('test_config_metrics_if_unsupported', function(cg) t.skip_if(utils.is_tarantool_3_config_supported(), 'Skip since Tarantool 3 config is supported') utils.create_server(cg) end) -g.after_test('test_config_alerts_if_unsupported', function(cg) 
+g.after_test('test_config_metrics_if_unsupported', function(cg) utils.drop_server(cg) cg.server = nil end) -g.test_config_alerts_if_unsupported = function(cg) +g.test_config_metrics_if_unsupported = function(cg) local observations = cg.server:exec(function() local metrics = require('metrics') metrics.invoke_callbacks() @@ -176,4 +247,7 @@ g.test_config_alerts_if_unsupported = function(cg) local alerts = utils.find_metric('tnt_config_alerts', observations) t.assert_equals(alerts, nil) + + local status = utils.find_metric('tnt_config_status', observations) + t.assert_equals(status, nil) end