Skip to content

Commit

Permalink
metrics: introduce config status gauge
Browse files Browse the repository at this point in the history
The approach used here to represent an enum metric is similar to the one
discussed in [1, 2]. It makes it possible to support a new status later, if
required. For example, one can visualize it with a hack like [3].

1. prometheus/client_python#416
2. open-telemetry/opentelemetry-specification#1711
3. https://stackoverflow.com/a/75761900/11646599

Part of tarantool/grafana-dashboard#224
  • Loading branch information
DifferentialOrange committed Jul 4, 2024
1 parent ef8e7ed commit cb01bec
Show file tree
Hide file tree
Showing 4 changed files with 135 additions and 12 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
- New Tarantool 3 metrics:
- tnt_config_alerts
- tnt_config_status

## [1.1.0] - 2024-05-17
### Added
Expand Down
18 changes: 18 additions & 0 deletions doc/monitoring/metrics_reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1011,4 +1011,22 @@ These metrics are available starting from Tarantool 3.0.
- Count of current instance :ref:`configuration apply alerts <_config_api_reference_info>`.
``{level="warn"}`` label covers warnings and
``{level="error"}`` covers errors.

* - ``tnt_config_status``
- The status of the current instance :ref:`configuration apply <_config_api_reference_info>`.
The ``status`` label contains one of the possible status names.
The current status has metric value ``1``; inactive statuses have metric value ``0``.
For example, the following set of metrics means that the current configuration
status for ``router-001-a`` is ``ready``.

.. code-block:: none
# HELP tnt_config_status Tarantool 3 configuration status
# TYPE tnt_config_status gauge
tnt_config_status{status="reload_in_progress",alias="router-001-a"} 0
tnt_config_status{status="uninitialized",alias="router-001-a"} 0
tnt_config_status{status="check_warnings",alias="router-001-a"} 0
tnt_config_status{status="ready",alias="router-001-a"} 1
tnt_config_status{status="check_errors",alias="router-001-a"} 0
tnt_config_status{status="startup_in_progress",alias="router-001-a"} 0
30 changes: 30 additions & 0 deletions metrics/tarantool/config.lua
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,23 @@ local function get_config_alerts(config_info)
return config_alerts
end

--- Convert the configuration status into a one-hot table of gauge values.
--
-- Every status name listed below is present in the result with value 0,
-- except the one reported in `config_info.status`, which gets value 1.
-- A status name that is not listed below is added to the result with
-- value 1.
--
-- @param config_info table with a `status` field
-- @return table mapping status name to 0 or 1
local function get_config_status(config_info)
    local current_status = config_info.status

    -- See state diagram here
    -- https://github.com/tarantool/doc/issues/3544#issuecomment-1866033480
    --
    -- All known statuses are listed explicitly so that inactive ones
    -- are still returned with a zero value.
    local status_flags = {
        uninitialized = 0,
        startup_in_progress = 0,
        reload_in_progress = 0,
        check_warnings = 0,
        check_errors = 0,
        ready = 0,
    }

    status_flags[current_status] = 1

    return status_flags
end

local function update()
if not utils.is_tarantool3() then
return
Expand All @@ -38,6 +55,19 @@ local function update()
{default = true}
)
end

local config_status = get_config_status(config_info)

for status, value in pairs(config_status) do
collectors_list.config_status = utils.set_gauge(
'config_status',
'Tarantool 3 configuration status',
value,
{status = status},
nil,
{default = true}
)
end
end

return {
Expand Down
98 changes: 86 additions & 12 deletions test/tarantool/config_metrics_test.lua
Original file line number Diff line number Diff line change
Expand Up @@ -123,51 +123,122 @@ local function assert_config_alerts_metrics(server, expected_values)
t.assert_equals(errors.value, expected_values['error'])
end

-- Collect metrics on `server` and check the `tnt_config_status` observations.
--
-- `expected_values` maps a status name to the gauge value expected for it.
-- Observations are looked up with the labels
-- {status = <name>, alias = 'server-001-a'}.
local function assert_config_status_metrics(server, expected_values)
-- Invoke metric callbacks and collect observations through server:exec.
local observations = server:exec(function()
local metrics = require('metrics')
metrics.invoke_callbacks()
return metrics.collect()
end)

-- NOTE(review): the two hook registrations below refer to
-- 'test_config_alerts_if_healthy', while the hooks and tests after this
-- helper use the 'test_config_metrics_*' names. They look like removed-side
-- diff lines that ended up inside this body -- confirm against the real file.
g.before_test('test_config_alerts_if_healthy', start_server)
g.after_test('test_config_alerts_if_healthy', stop_server)
for status, expected_value in pairs(expected_values) do
local actual_obs = utils.find_obs(
'tnt_config_status',
{status = status, alias = 'server-001-a'},
observations
)
-- NOTE(review): if utils.find_obs returns nil for a missing observation,
-- the `.value` access below raises before the assertion runs -- confirm.
t.assert_equals(actual_obs.value, expected_value,
("got expected value for %q"):format(status))
end
end

g.test_config_alerts_if_healthy = function(cg)

g.before_test('test_config_metrics_if_healthy', start_server)
g.after_test('test_config_metrics_if_healthy', stop_server)

--- Without any config reload, expect no alerts and the `ready` status.
function g.test_config_metrics_if_healthy(cg)
    local server = cg.server

    local expected_alerts = {warn = 0, error = 0}
    assert_config_alerts_metrics(server, expected_alerts)

    -- Only `ready` is active; every other status must be reported as 0.
    local expected_status = {
        uninitialized = 0,
        startup_in_progress = 0,
        reload_in_progress = 0,
        check_warnings = 0,
        check_errors = 0,
        ready = 1,
    }
    assert_config_status_metrics(server, expected_status)
end


g.before_test('test_config_alerts_if_minor_trouble', start_server)
g.after_test('test_config_alerts_if_minor_trouble', stop_server)
g.before_test('test_config_metrics_if_minor_trouble', start_server)
g.after_test('test_config_metrics_if_minor_trouble', stop_server)

g.test_config_alerts_if_minor_trouble = function(cg)
--- After a reload that yields one warning, expect the `check_warnings` status.
function g.test_config_metrics_if_minor_trouble(cg)
    -- Work on a copy so default_config stays intact for other tests.
    local config = table.deepcopy(default_config)
    -- NOTE(review): presumably 'role_two' is not defined in default_config,
    -- which is what produces the warning -- confirm.
    config.credentials.users.user_one = {roles = {'role_two'}}
    reload_config(cg, config)

    local server = cg.server

    local expected_alerts = {warn = 1, error = 0}
    assert_config_alerts_metrics(server, expected_alerts)

    local expected_status = {
        uninitialized = 0,
        startup_in_progress = 0,
        reload_in_progress = 0,
        check_warnings = 1,
        check_errors = 0,
        ready = 0,
    }
    assert_config_status_metrics(server, expected_status)
end


g.before_test('test_config_alerts_if_critical_failure', start_server)
g.after_test('test_config_alerts_if_critical_failure', stop_server)
g.before_test('test_config_metrics_if_critical_failure', start_server)
g.after_test('test_config_metrics_if_critical_failure', stop_server)

g.test_config_alerts_if_critical_failure = function(cg)
--- After a reload that yields one error, expect the `check_errors` status.
function g.test_config_metrics_if_critical_failure(cg)
    -- Work on a copy so default_config stays intact for other tests.
    local config = table.deepcopy(default_config)
    -- Replace the 'servers' group with an empty table before reloading.
    config.groups.servers = {}
    reload_config(cg, config)

    local server = cg.server

    local expected_alerts = {warn = 0, error = 1}
    assert_config_alerts_metrics(server, expected_alerts)

    local expected_status = {
        uninitialized = 0,
        startup_in_progress = 0,
        reload_in_progress = 0,
        check_warnings = 0,
        check_errors = 1,
        ready = 0,
    }
    assert_config_status_metrics(server, expected_status)
end


g.before_test('test_config_alerts_if_unsupported', function(cg)
-- Setup for the "uninitialized" case: the test is skipped unless Tarantool 3
-- config is supported; the server is created with utils.create_server and
-- configured through a plain box.cfg call.
g.before_test('test_config_metrics_if_uninitialized', function(cg)
t.skip_if(not utils.is_tarantool_3_config_supported(),
'Skip since Tarantool 3 config is unsupported')
utils.create_server(cg)
cg.server:exec(function()
-- Config does this by default (the alias label is set manually here
-- so that the metric assertions can look observations up by alias):
-- https://github.com/tarantool/tarantool/blob/319357d5973d15d08b8eda6a230eada08b710802/src/box/lua/config/applier/box_cfg.lua#L614
box.cfg{metrics = {labels = {alias = 'server-001-a'}}}
end)
end)

-- Teardown: drop the server and clear the reference stored on the group context.
g.after_test('test_config_metrics_if_uninitialized', function(cg)
utils.drop_server(cg)
cg.server = nil
end)

--- On the server set up by this test's before hook, expect no alerts and
--- the `uninitialized` status.
function g.test_config_metrics_if_uninitialized(cg)
    local server = cg.server

    local expected_alerts = {warn = 0, error = 0}
    assert_config_alerts_metrics(server, expected_alerts)

    -- Only `uninitialized` is active; every other status must be reported as 0.
    local expected_status = {
        uninitialized = 1,
        startup_in_progress = 0,
        reload_in_progress = 0,
        check_warnings = 0,
        check_errors = 0,
        ready = 0,
    }
    assert_config_status_metrics(server, expected_status)
end


-- Setup for the "unsupported" case: the test is skipped when Tarantool 3
-- config is supported; otherwise a server is created with utils.create_server.
g.before_test('test_config_metrics_if_unsupported', function(cg)
t.skip_if(utils.is_tarantool_3_config_supported(),
'Skip since Tarantool 3 config is supported')
utils.create_server(cg)
end)

g.after_test('test_config_alerts_if_unsupported', function(cg)
-- Teardown: drop the server and clear the reference stored on the group context.
g.after_test('test_config_metrics_if_unsupported', function(cg)
utils.drop_server(cg)
cg.server = nil
end)

g.test_config_alerts_if_unsupported = function(cg)
g.test_config_metrics_if_unsupported = function(cg)
local observations = cg.server:exec(function()
local metrics = require('metrics')
metrics.invoke_callbacks()
Expand All @@ -176,4 +247,7 @@ g.test_config_alerts_if_unsupported = function(cg)

local alerts = utils.find_metric('tnt_config_alerts', observations)
t.assert_equals(alerts, nil)

local status = utils.find_metric('tnt_config_status', observations)
t.assert_equals(status, nil)
end

0 comments on commit cb01bec

Please sign in to comment.