From 3f823e098a8b6ac43276d6cd9f512a4d56e9c75c Mon Sep 17 00:00:00 2001
From: Alexander Kuleshov <kuleshovmail@gmail.com>
Date: Tue, 27 Jul 2021 21:39:03 +0600
Subject: [PATCH] Make server status metrics optional

This commit adds a new configuration option - `server_status_metrics_enabled` - to
enable the new RADIUS server status metrics.

This was done to avoid crashing at startup of the eradius application when
eradius is used without the prometheus library, since the server status metric
would otherwise always be created and used.

The default value of the new configuration option is set to `false` to preserve
backward compatibility.
---
 METRICS.md                     | 12 ++++++++++++
 README.md                      |  5 +++++
 src/eradius.app.src            |  1 +
 src/eradius_client.erl         | 20 +++++++++++++++++---
 test/eradius_metrics_SUITE.erl |  8 ++++++--
 5 files changed, 41 insertions(+), 5 deletions(-)

diff --git a/METRICS.md b/METRICS.md
index 795c5778..3b3c5013 100644
--- a/METRICS.md
+++ b/METRICS.md
@@ -77,6 +77,18 @@ _All metrics start with `eradius` prefix and the prefix is not included into tab
 |  client_unknown_type_request_total                 | [$NAME, $IP, $PORT, $CNAME, $CIP]     | counter   |
 |  client_bad_authenticator_request_total            | [$NAME, $IP, $PORT, $CNAME, $CIP]     | counter   |
 
+Besides these metrics, the RADIUS client may also create optional server status metrics, which
+can be enabled via the `server_status_metrics_enabled` configuration option. These metrics
+represent the active/inactive state of the upstream RADIUS servers that RADIUS clients
+send requests to.
+
+If RADIUS server status metrics are enabled, the following additional metric will be exposed:
+
+| Metric                      | Labels              | Type    |
+|-----------------------------|---------------------|---------|
+| server_status               | [ $IP, $PORT ]      | boolean |
+
+
 ### Labels
 
 Following prometheus labels are used to specify a metric:
diff --git a/README.md b/README.md
index ac82d4fc..bf4a6d85 100644
--- a/README.md
+++ b/README.md
@@ -232,6 +232,8 @@ Example of full configuration with keys which can use in `eradius`:
             {{127, 0, 0, 3}, 1812, <<"secret">>}
         ]}
     ]},
+    {server_status_metrics_enabled, false},
+    {counter_aggregator, false},
     %% Size of RADIUS receive buffer      
     {recbuf, 8192}
 ]}].
@@ -269,6 +271,7 @@ All pools are configured via:
     %%% ...
 ]}]
 ```
+
 ## Failover Erlang code usage
 In a case when RADIUS proxy (eradius_proxy handler) is not used, a list of RADIUS upstream servers could be passed to the `eradius_client:send_radius_request/3` via options, for example:
 ```erlang
@@ -276,6 +279,8 @@ eradius_client:send_request(Server, Request, [{failover, [{"localhost", 1814, <<
 ```
 If `failover` option was not passed to the client through the options or RADIUS proxy configuration there should not be any performance impact as RADIUS client will try to a RADIUS request to only one RADIUS server that is defined in `eradius_client:send_request/3` options.
 
+For each secondary RADIUS server, server status metrics can be enabled via the boolean `server_status_metrics_enabled` configuration option.
+
 # Eradius counter aggregator
 The `eradius_counter_aggregator` would go over all nodes in an Erlang cluster and aggregate the counter values from all nodes.  
 Configuration value of `counter_aggregator` can be `true` or `false` where `true` - is enable, `false` - is disable counter aggregator.  
diff --git a/src/eradius.app.src b/src/eradius.app.src
index 2c77f2e3..924907b6 100644
--- a/src/eradius.app.src
+++ b/src/eradius.app.src
@@ -13,6 +13,7 @@
       {resend_timeout, 30000},
       {logging, false},
       {counter_aggregator, false},
+      {server_status_metrics_enabled, false},
       {logfile, "./radius.log"},
       {recbuf, 8192}
    ]},
diff --git a/src/eradius_client.erl b/src/eradius_client.erl
index 87a81460..873dfb04 100644
--- a/src/eradius_client.erl
+++ b/src/eradius_client.erl
@@ -14,7 +14,8 @@
 -export([start_link/0, send_request/2, send_request/3, send_remote_request/3, send_remote_request/4]).
 %% internal
 -export([reconfigure/0, send_remote_request_loop/8, find_suitable_peer/1,
-         restore_upstream_server/1, store_radius_server_from_pool/3]).
+         restore_upstream_server/1, store_radius_server_from_pool/3,
+         init_server_status_metrics/0]).
 
 -behaviour(gen_server).
 -export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2, code_change/3]).
@@ -410,7 +411,8 @@ configure(State) ->
 prepare_pools() ->
     ets:new(?MODULE, [ordered_set, public, named_table, {keypos, 1}, {write_concurrency,true}]),
     lists:foreach(fun({_PoolName, Servers}) -> prepare_pool(Servers) end, application:get_env(eradius, servers_pool, [])),
-    lists:foreach(fun(Server) -> store_upstream_servers(Server) end, application:get_env(eradius, servers, [])).
+    lists:foreach(fun(Server) -> store_upstream_servers(Server) end, application:get_env(eradius, servers, [])),
+    init_server_status_metrics().
 
 prepare_pool([]) -> ok;
 prepare_pool([{Addr, Port, _, Opts} | Servers]) ->
@@ -445,7 +447,6 @@ store_upstream_servers(Server) ->
 
 %% private
 store_radius_server_from_pool(Addr, Port, Retries) when is_tuple(Addr) and is_integer(Port) and is_integer(Retries) ->
-    eradius_counter:set_boolean_metric(server_status, [Addr, Port], false),
     ets:insert(?MODULE, {{Addr, Port}, Retries, Retries});
 store_radius_server_from_pool(Addr, _, _) ->
     ?LOG(error, "bad IP address specified in RADIUS servers pool configuration ~p", [Addr]),
@@ -542,6 +543,19 @@ parse_ip(T = {_, _, _, _}) ->
 parse_ip(T = {_, _, _, _, _, _}) ->
     {ok, T}.
 
+init_server_status_metrics() ->
+    case application:get_env(eradius, server_status_metrics_enabled, false) of
+        false ->
+            ok;
+        true ->
+            % This is called at eradius startup, so we must make sure that the prometheus
+            % application is already started if server status metrics are supposed to be used
+            application:ensure_all_started(prometheus),
+            ets:foldl(fun ({{Addr, Port}, _, _}, _Acc) ->
+                eradius_counter:set_boolean_metric(server_status, [Addr, Port], false)
+            end, [], ?MODULE)
+    end.
+
 make_metrics_info(Options, {ServerIP, ServerPort}) ->
     ServerName = proplists:get_value(server_name, Options, undefined),
     ClientName = proplists:get_value(client_name, Options, undefined),
diff --git a/test/eradius_metrics_SUITE.erl b/test/eradius_metrics_SUITE.erl
index 63775ab4..11e2f2c3 100644
--- a/test/eradius_metrics_SUITE.erl
+++ b/test/eradius_metrics_SUITE.erl
@@ -61,7 +61,8 @@ init_per_suite(Config) ->
                      {tables, [dictionary]},
                      {client_ip, {127,0,0,2}},
                      {client_ports, 20},
-                     {counter_aggregator, false}
+                     {counter_aggregator, false},
+                     {server_status_metrics_enabled, true}
                     ],
     [application:set_env(eradius, Key, Value) || {Key, Value} <- EradiusConfig],
     application:set_env(prometheus, collectors, [eradius_prometheus_collector]),
@@ -80,6 +81,10 @@ end_per_suite(_Config) ->
     application:stop(prometheus),
     ok.
 
+init_per_testcase(_, Config) ->
+    eradius_client:init_server_status_metrics(),
+    Config.
+
 %% tests
 good_requests(_Config) ->
     Requests = [{request, access, access_accept},
@@ -126,7 +131,6 @@ check_single_request(bad, EradiusRequestType, _RequestType, _ResponseType) ->
     ok = check_metric(reject_responses_total, [{server_name, bad}], 1),
     ok = check_metric(server_status, true, [eradius_test_handler:localhost(tuple), 1813]);
 check_single_request(error, EradiusRequestType, _RequestType, _ResponseType) ->
-    eradius_client:reconfigure(),
     ok = send_request(EradiusRequestType, eradius_test_handler:localhost(tuple), 1814, ?ATTRS_ERROR,
                       [{server_name, error}, {client_name, test}, {timeout, 1000},
                        {failover, [{eradius_test_handler:localhost(tuple), 1812, ?SECRET}]}]),