diff --git a/CHANGELOG.md b/CHANGELOG.md index b69ae41b..229697a0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - tools to compute metrics aggregates: - per second rate for counters; - min and max for gauges; + - average for histograms and summaries; ### Changed - Setup cartridge hotreload inside the role diff --git a/doc/monitoring/api_reference.rst b/doc/monitoring/api_reference.rst index 794bd480..c2f17a66 100644 --- a/doc/monitoring/api_reference.rst +++ b/doc/monitoring/api_reference.rst @@ -616,7 +616,9 @@ Metrics functions * ``rate`` for counter collectors: per second rate of value change for the last two observations; * ``min`` for gauge collectors: minimal value for the history of observations; - * ``max`` for gauge collectors: maximal value for the history of observations. + * ``max`` for gauge collectors: maximal value for the history of observations; + * ``average`` for histogram and summary collectors: observations average value + (over all history of observations). :param table output_with_aggregates_prev: a previous result of this method call. Use ``nil`` if this is the first invokation. You may use diff --git a/metrics/aggregates.lua b/metrics/aggregates.lua index e562ee06..1bff473f 100644 --- a/metrics/aggregates.lua +++ b/metrics/aggregates.lua @@ -1,12 +1,19 @@ local string_utils = require('metrics.string_utils') local Counter = require('metrics.collectors.counter') local Gauge = require('metrics.collectors.gauge') +local Histogram = require('metrics.collectors.histogram') +local Summary = require('metrics.collectors.summary') + +-- Otherwise we need to implement different average processors. 
+assert(Histogram.SUM_SUFFIX == Summary.SUM_SUFFIX) +assert(Histogram.COUNT_SUFFIX == Summary.COUNT_SUFFIX) local mksec_in_sec = 1e6 local RATE_SUFFIX = 'per_second' local MIN_SUFFIX = 'min' local MAX_SUFFIX = 'max' +local AVERAGE_SUFFIX = 'average' local function compute_rate_value(time_delta, obs_prev, obs) if obs_prev == nil then @@ -142,16 +149,73 @@ local function compute_gauge_max(output_with_aggregates_prev, output, coll_key, math.max, MAX_SUFFIX, "Maximum of ") end +local function compute_average_value(sum_obs, count_obs) + -- For each count there should be a sum, otherwise info is malformed. + if sum_obs == nil then + return nil + end + + if count_obs.value == 0 then + return { + label_pairs = count_obs.label_pairs, + value = 0, + } + end + + return { + label_pairs = count_obs.label_pairs, + -- Force to float division instead of possible cdata integer division. + value = tonumber(sum_obs.value) / tonumber(count_obs.value), + } +end + +local function compute_collector_average(_, output, _, coll_obs) + local name = string_utils.build_name(coll_obs.name_prefix, AVERAGE_SUFFIX) + local kind = Gauge.kind + local registry_key = string_utils.build_registry_key(name, kind) + + if output[registry_key] ~= nil then + -- If, for any reason, a registry collision has happened, + -- we assume that there is already an aggregate metric with the + -- similar meaning. + return output + end + + local values = {} + + for key, count_obs in pairs(coll_obs.observations[Histogram.COUNT_SUFFIX] or {}) do + local sum_obs = (coll_obs.observations[Histogram.SUM_SUFFIX] or {})[key] + values[key] = compute_average_value(sum_obs, count_obs) + end + + local metainfo = table.deepcopy(coll_obs.metainfo) + metainfo.aggregate = true + + output[registry_key] = { + name = name, + name_prefix = coll_obs.name_prefix, + help = "Average value (over all time) of " .. 
+ coll_obs.name, + kind = kind, + metainfo = metainfo, + timestamp = coll_obs.timestamp, + observations = {[''] = values} + } + + return output +end local default_kind_rules = { [Counter.kind] = { 'rate' }, [Gauge.kind] = { 'min', 'max' }, + [Histogram.kind] = { 'average' }, + [Summary.kind] = { 'average' }, } local rule_processors = { rate = compute_counter_rate, min = compute_gauge_min, max = compute_gauge_max, + average = compute_collector_average, } local function compute(output_with_aggregates_prev, output, kind_rules) diff --git a/test/aggregates_test.lua b/test/aggregates_test.lua index 6c17d230..ddc05496 100644 --- a/test/aggregates_test.lua +++ b/test/aggregates_test.lua @@ -65,6 +65,105 @@ local function get_gauge_example(timestamp, value1, value2) return res end +local function get_histogram_example(timestamp, count_1, sum_1, count_2, sum_2) + return { + http_server_request_latencyhistogram = { + name = 'http_server_request_latency', + name_prefix = 'http_server_request_latency', + kind = 'histogram', + help = 'HTTP server request latency', + metainfo = { default = true }, + timestamp = timestamp, + observations = { + count = { + ["code\t200"] = { + label_pairs = { alias = 'router', code = '200' }, + value = count_1, + }, + ["code\t400"] = { + label_pairs = { alias = 'router', code = '400' }, + value = count_2, + } + }, + sum = { + ["code\t200"] = { + label_pairs = { alias = 'router', code = '200' }, + value = sum_1, + }, + ["code\t400"] = { + label_pairs = { alias = 'router', code = '400' }, + value = sum_2, + } + }, + bucket = { + ["code\t200\tle\t0.1"] = { + label_pairs = { alias = 'router', code = '200', le = '0.1' }, + value = 2064, -- Not used anywhere, so don't fill. + }, + ["code\t200\tle\tinf"] = { + label_pairs = { alias = 'router', code = '200', le = 'inf' }, + value = count_1, + }, + ["code\t400\tle\t0.1"] = { + label_pairs = { alias = 'router', code = '400', le = '0.1' }, + value = 323, -- Not used anywhere, so don't fill. 
+ }, + ["code\t400\tle\tinf"] = { + label_pairs = { alias = 'router', code = '400', le = 'inf' }, + value = count_2, + }, + } + } + } + } +end + +local function get_summary_example(timestamp, count_1, sum_1, count_2, sum_2) + return { + http_server_request_latencysummary = { + name = 'http_server_request_latency', + name_prefix = 'http_server_request_latency', + kind = 'summary', + help = 'HTTP server request latency', + metainfo = { default = true }, + timestamp = timestamp, + observations = { + count = { + ["code\t200"] = { + label_pairs = { alias = 'router', code = '200' }, + value = count_1, + }, + ["code\t400"] = { + label_pairs = { alias = 'router', code = '400' }, + value = count_2, + } + }, + sum = { + ["code\t200"] = { + label_pairs = { alias = 'router', code = '200' }, + value = sum_1, + }, + ["code\t400"] = { + label_pairs = { alias = 'router', code = '400' }, + value = sum_2, + } + }, + [''] = { + ["code\t200\tquantile\t0.5"] = { + label_pairs = { alias = 'router', code = '200', quantile = '0.5' }, + value = 2064, -- Not used anywhere, so don't fill. + }, + ["code\t400\tquantile\t0.5"] = { + label_pairs = { alias = 'router', code = '400', quantile = '0.5' }, + value = 323, + }, + } + } + + } + } +end + g.test_unknown_rule = function() local output = get_counter_example(1676364616294847ULL, 14148, 3204) @@ -305,3 +404,43 @@ g.test_gauge_min_max_disabled = function() t.assert_equals(utils.len(output_with_aggregates_2), 1, "No min or max computed due to options") end + +local average_cases = { + histogram = get_histogram_example, + summary = get_summary_example, +} + +for k, generator in pairs(average_cases) do + g['test_' .. k .. 
'_average'] = function() + local output = generator(1676364616294847ULL, 20000, 10000, 1000, 150) + + local output_with_aggregates = metrics.compute_aggregates(nil, output) + t.assert_equals(utils.len(output_with_aggregates), 2, + "Average computed for a single observation") + + local average_obs = output_with_aggregates['http_server_request_latency_averagegauge'] + t.assert_not_equals(average_obs, nil, "Average computed") + t.assert_equals(average_obs.name, 'http_server_request_latency_average') + t.assert_equals(average_obs.name_prefix, 'http_server_request_latency') + t.assert_equals(average_obs.kind, 'gauge') + t.assert_equals(average_obs.help, 'Average value (over all time) of http_server_request_latency') + t.assert_equals(average_obs.metainfo.default, true) + t.assert_equals(average_obs.metainfo.aggregate, true) + t.assert_equals(average_obs.timestamp, 1676364616294847ULL) + t.assert_equals(average_obs.observations['']['code\t200'].label_pairs, + { alias = 'router', code = '200' }) + t.assert_almost_equals(average_obs.observations['']['code\t200'].value, 10000 / 20000) + t.assert_equals(average_obs.observations['']['code\t400'].label_pairs, + { alias = 'router', code = '400' }) + t.assert_almost_equals(average_obs.observations['']['code\t400'].value, 150 / 1000) + end + + g['test_' .. k .. '_average_disabled'] = function() + local output = generator(1676364616294847ULL, 20000, 10000, 1000, 150) + + local opts = {[k] = {}} + local output_with_aggregates = metrics.compute_aggregates(nil, output, opts) + t.assert_equals(utils.len(output_with_aggregates), 1, + "No average computed due to options") + end +end