Skip to content

Commit

Permalink
failover: add failover_primary_timeout option
Browse files Browse the repository at this point in the history
This was previously hardcoded to 31 seconds (hardcoded retry_timout +
1). This may be too short period under some circumstances.

When we retry primary server we drop connection to the backup server and
if the primary server is not yet available (and there are many
unavailable primary servers) we may go through a long timeout cycle
every half minute.

This patch makes the value configurable.

:config: Added `failover_primary_timout` configuration option. This
  can be used to configure how often SSSD tries to reconnect to a
  primary server after a successful connection to a backup server.
  This was previously hardcoded to 31 seconds which is kept as
  the default value.

Resolves: #7375

Reviewed-by: Alexey Tikhonov <[email protected]>
Reviewed-by: Iker Pedrosa <[email protected]>
(cherry picked from commit e9738e3)
(cherry picked from commit 14f32f6)
  • Loading branch information
pbrezina authored and alexey-tikhonov committed Nov 15, 2024
1 parent c012186 commit f5bfcb9
Show file tree
Hide file tree
Showing 10 changed files with 110 additions and 2 deletions.
2 changes: 2 additions & 0 deletions src/config/SSSDConfig/sssdoptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,8 @@ def __init__(self):
'dns_resolver_op_timeout': _('How long should keep trying to resolve single DNS query (seconds)'),
'dns_resolver_timeout': _('How long to wait for replies from DNS when resolving servers (seconds)'),
'dns_discovery_domain': _('The domain part of service discovery DNS query'),
'failover_primary_timeout': _('How often SSSD tries to reconnect to the primary server after a successful '
'connection to the backup server.'),
'override_gid': _('Override GID value from the identity provider with this value'),
'case_sensitive': _('Treat usernames as case sensitive'),
'entry_cache_user_timeout': _('Entry cache timeout length (seconds)'),
Expand Down
2 changes: 2 additions & 0 deletions src/config/SSSDConfigTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -579,6 +579,7 @@ def testListOptions(self):
'dns_resolver_op_timeout',
'dns_resolver_timeout',
'dns_discovery_domain',
'failover_primary_timeout',
'dyndns_update',
'dyndns_ttl',
'dyndns_iface',
Expand Down Expand Up @@ -939,6 +940,7 @@ def testRemoveProvider(self):
'dns_resolver_op_timeout',
'dns_resolver_timeout',
'dns_discovery_domain',
'failover_primary_timeout',
'dyndns_update',
'dyndns_ttl',
'dyndns_iface',
Expand Down
1 change: 1 addition & 0 deletions src/config/cfg_rules.ini
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,7 @@ option = dns_resolver_op_timeout
option = dns_resolver_timeout
option = dns_resolver_use_search_list
option = dns_discovery_domain
option = failover_primary_timeout
option = override_gid
option = case_sensitive
option = override_homedir
Expand Down
1 change: 1 addition & 0 deletions src/config/etc/sssd.api.conf
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ dns_resolver_server_timeout = int, None, false
dns_resolver_op_timeout = int, None, false
dns_resolver_timeout = int, None, false
dns_discovery_domain = str, None, false
failover_primary_timeout = int, None, false
override_gid = int, None, false
case_sensitive = str, None, false
override_homedir = str, None, false
Expand Down
19 changes: 19 additions & 0 deletions src/man/sssd.conf.5.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3773,6 +3773,25 @@ pam_gssapi_indicators_map = sudo:pkinit, sudo-i:pkinit
</listitem>
</varlistentry>

<varlistentry>
<term>failover_primary_timeout (integer)</term>
<listitem>
<para>
When no primary server is currently available,
SSSD fail overs to a backup server. This option
defines the amount of time (in seconds) to
wait before SSSD tries to reconnect to a primary
server again.
</para>
<para>
Note: The minimum value is 31.
</para>
<para>
Default: 31
</para>
</listitem>
</varlistentry>

<varlistentry>
<term>override_gid (integer)</term>
<listitem>
Expand Down
1 change: 1 addition & 0 deletions src/providers/data_provider.h
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,7 @@ enum dp_res_opts {
DP_RES_OPT_RESOLVER_SERVER_TIMEOUT,
DP_RES_OPT_RESOLVER_USE_SEARCH_LIST,
DP_RES_OPT_DNS_DOMAIN,
DP_RES_OPT_FAILOVER_PRIMARY_TIMEOUT,

DP_RES_OPTS /* attrs counter */
};
Expand Down
14 changes: 12 additions & 2 deletions src/providers/data_provider_fo.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,20 @@ static int be_fo_get_options(struct be_ctx *ctx,
DP_RES_OPT_RESOLVER_TIMEOUT);
opts->use_search_list = dp_opt_get_bool(ctx->be_res->opts,
DP_RES_OPT_RESOLVER_USE_SEARCH_LIST);
opts->primary_timeout = dp_opt_get_int(ctx->be_res->opts,
DP_RES_OPT_FAILOVER_PRIMARY_TIMEOUT);

opts->retry_timeout = 30;
opts->srv_retry_neg_timeout = 15;
opts->family_order = ctx->be_res->family_order;

if (opts->primary_timeout <= opts->retry_timeout) {
opts->primary_timeout = opts->retry_timeout + 1;
DEBUG(SSSDBG_CONF_SETTINGS,
"Warning: failover_primary_timeout is too low, using %lu "
"seconds instead\n", opts->primary_timeout);
}

return EOK;
}

Expand Down Expand Up @@ -551,7 +561,7 @@ static void be_resolve_server_done(struct tevent_req *subreq)
struct tevent_req);
struct be_resolve_server_state *state = tevent_req_data(req,
struct be_resolve_server_state);
time_t timeout = fo_get_service_retry_timeout(state->svc->fo_service) + 1;
time_t timeout = fo_get_primary_retry_timeout(state->svc->fo_service);
int ret;

ret = be_resolve_server_process(subreq, state, &new_subreq);
Expand All @@ -564,7 +574,6 @@ static void be_resolve_server_done(struct tevent_req *subreq)
}

if (!fo_is_server_primary(state->srv)) {
/* FIXME: make the timeout configurable */
ret = be_primary_server_timeout_activate(state->ctx, state->ev,
state->ctx, state->svc,
timeout);
Expand Down Expand Up @@ -871,6 +880,7 @@ static struct dp_option dp_res_default_opts[] = {
{ "dns_resolver_server_timeout", DP_OPT_NUMBER, { .number = 1000 }, NULL_NUMBER },
{ "dns_resolver_use_search_list", DP_OPT_BOOL, BOOL_TRUE, BOOL_TRUE },
{ "dns_discovery_domain", DP_OPT_STRING, NULL_STRING, NULL_STRING },
{ "failover_primary_timeout", DP_OPT_NUMBER, { .number = 31 }, NULL_NUMBER },
DP_OPTION_TERMINATOR
};

Expand Down
10 changes: 10 additions & 0 deletions src/providers/fail_over.c
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ fo_context_init(TALLOC_CTX *mem_ctx, struct fo_options *opts)

ctx->opts->srv_retry_neg_timeout = opts->srv_retry_neg_timeout;
ctx->opts->retry_timeout = opts->retry_timeout;
ctx->opts->primary_timeout = opts->primary_timeout;
ctx->opts->family_order = opts->family_order;
ctx->opts->service_resolv_timeout = opts->service_resolv_timeout;
ctx->opts->use_search_list = opts->use_search_list;
Expand Down Expand Up @@ -1740,6 +1741,15 @@ time_t fo_get_service_retry_timeout(struct fo_service *svc)
return svc->ctx->opts->retry_timeout;
}

time_t fo_get_primary_retry_timeout(struct fo_service *svc)
{
if (svc == NULL || svc->ctx == NULL || svc->ctx->opts == NULL) {
return 0;
}

return svc->ctx->opts->primary_timeout;
}

bool fo_get_use_search_list(struct fo_server *server)
{
if (
Expand Down
3 changes: 3 additions & 0 deletions src/providers/fail_over.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ struct fo_server;
struct fo_options {
time_t srv_retry_neg_timeout;
time_t retry_timeout;
time_t primary_timeout;
int service_resolv_timeout;
bool use_search_list;
enum restrict_family family_order;
Expand Down Expand Up @@ -211,6 +212,8 @@ int fo_is_srv_lookup(struct fo_server *s);

time_t fo_get_service_retry_timeout(struct fo_service *svc);

time_t fo_get_primary_retry_timeout(struct fo_service *svc);

bool fo_get_use_search_list(struct fo_server *server);

void fo_reset_services(struct fo_ctx *fo_ctx);
Expand Down
59 changes: 59 additions & 0 deletions src/tests/system/tests/test_failover.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""
SSSD Failover tests.
:requirement: Failover
"""

from __future__ import annotations

import pytest
from sssd_test_framework.roles.client import Client
from sssd_test_framework.roles.ldap import LDAP
from sssd_test_framework.topology import KnownTopology


@pytest.mark.parametrize("value, expected", [(None, 31), (15, 31), (60, 60)])
@pytest.mark.importance("low")
@pytest.mark.ticket(gh=7375, jira="RHEL-17659")
@pytest.mark.topology(KnownTopology.LDAP)
def test_failover__retry_primary(client: Client, ldap: LDAP, value: int | None, expected: int):
"""
:title: Primary server reactivation timeout is respected
:setup:
1. Create LDAP user "user-1"
2. Set failover_primary_timeout to @value
3. Set ldap_uri to invalid, not working server
4. Set ldap_backup_uri to working server
5. Start SSSD
:steps:
1. Lookup user-1
2. Check that SSSD is connected to backup server
3. Find "Primary server reactivation timeout set to @expected seconds" in domain logs
:expectedresults:
1. SSSD failover to backup server
2. SSSD is indeed connected to the backup server
3. String is found
:customerscenario: True
"""
ldap.user("user-1").add()

if value is not None:
client.sssd.domain["failover_primary_timeout"] = str(value)

client.sssd.enable_responder("ifp")
client.sssd.domain["ldap_uri"] = "ldap://ldap.invalid"
client.sssd.domain["ldap_backup_uri"] = f"ldap://{ldap.host.hostname}"
client.sssd.start()

# Lookup user to make sure SSSD did correctly failover to backup server
result = client.tools.id("user-1")
assert result is not None

# Check that SSSD is indeed connected to backup server
assert client.sssd.default_domain is not None
status = client.sssctl.domain_status(client.sssd.default_domain, active=True)
assert ldap.host.hostname in status.stdout

# Check that primary server reactivation timeout was correctly created
log = client.fs.read(client.sssd.logs.domain())
assert f"Primary server reactivation timeout set to {expected} seconds" in log

0 comments on commit f5bfcb9

Please sign in to comment.