Subdaemon heartbeat with modified libqb (async API for connect) #2588

Merged: 3 commits, Jan 19, 2022
11 changes: 5 additions & 6 deletions configure.ac
@@ -1309,10 +1309,11 @@ PKG_CHECK_MODULES(libqb, libqb >= 0.17)
CPPFLAGS="$libqb_CFLAGS $CPPFLAGS"
LIBS="$libqb_LIBS $LIBS"

dnl libqb 2.0.3 + ipc-connect-async-API (2022-01)
AC_CHECK_FUNCS([qb_ipcc_connect_async])

dnl libqb 2.0.2+ (2020-10)
AC_CHECK_FUNCS(qb_ipcc_auth_get,
AC_DEFINE(HAVE_IPCC_AUTH_GET, 1,
[Have qb_ipcc_auth_get function]))
AC_CHECK_FUNCS([qb_ipcc_auth_get])

dnl libqb 2.0.0+ (2020-05)
CHECK_ENUM_VALUE([qb/qblog.h],[qb_log_conf],[QB_LOG_CONF_MAX_LINE_LEN])
@@ -1585,9 +1586,7 @@ AS_IF([test $with_corosync -ne $DISABLED],
dnl Shutdown tracking added (back) to corosync Jan 2021
saved_LIBS="$LIBS"
LIBS="$LIBS $COROSYNC_LIBS"
AC_CHECK_FUNCS(corosync_cfg_trackstart,
AC_DEFINE(HAVE_COROSYNC_CFG_TRACKSTART, 1,
[Have corosync_cfg_trackstart function]))
AC_CHECK_FUNCS([corosync_cfg_trackstart])
LIBS="$saved_LIBS"
]
)
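
For context on the simplification above: AC_CHECK_FUNCS([foo]) automatically emits a HAVE_FOO macro into config.h when the function is found, so the explicit AC_DEFINE wrappers that were removed are redundant (note the auto-generated name for qb_ipcc_auth_get is HAVE_QB_IPCC_AUTH_GET, not the previous hand-rolled HAVE_IPCC_AUTH_GET). A hypothetical excerpt of the generated config.h on a system whose libqb provides both functions:

/* config.h (sketch) -- autoconf output; names follow AC_CHECK_FUNCS's
 * HAVE_<FUNCTION> convention and are what the C code tests with #ifdef */
#define HAVE_QB_IPCC_CONNECT_ASYNC 1
#define HAVE_QB_IPCC_AUTH_GET 1
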
2 changes: 2 additions & 0 deletions daemons/pacemakerd/pacemakerd.c
@@ -259,6 +259,8 @@ main(int argc, char **argv)
pcmk_ipc_api_t *old_instance = NULL;
qb_ipcs_service_t *ipcs = NULL;

subdaemon_check_progress = time(NULL);

crm_log_preinit(NULL, argc, argv);
mainloop_add_signal(SIGHUP, pcmk_ignore);
mainloop_add_signal(SIGQUIT, pcmk_sigquit);
3 changes: 2 additions & 1 deletion daemons/pacemakerd/pacemakerd.h
@@ -1,5 +1,5 @@
/*
* Copyright 2010-2021 the Pacemaker project contributors
* Copyright 2010-2022 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
@@ -21,6 +21,7 @@ extern unsigned int shutdown_complete_state_reported_to;
extern gboolean shutdown_complete_state_reported_client_closed;
extern crm_trigger_t *shutdown_trigger;
extern crm_trigger_t *startup_trigger;
extern time_t subdaemon_check_progress;

gboolean mcp_read_config(void);

6 changes: 3 additions & 3 deletions daemons/pacemakerd/pcmkd_messages.c
@@ -1,5 +1,5 @@
/*
* Copyright 2010-2021 the Pacemaker project contributors
* Copyright 2010-2022 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
@@ -25,7 +25,6 @@ pcmk_handle_ping_request(pcmk__client_t *c, xmlNode *msg, uint32_t id)
const char *value = NULL;
xmlNode *ping = NULL;
xmlNode *reply = NULL;
time_t pinged = time(NULL);
const char *from = crm_element_value(msg, F_CRM_SYS_FROM);

/* Pinged for status */
@@ -36,7 +35,8 @@ pcmk_handle_ping_request(pcmk__client_t *c, xmlNode *msg, uint32_t id)
value = crm_element_value(msg, F_CRM_SYS_TO);
crm_xml_add(ping, XML_PING_ATTR_SYSFROM, value);
crm_xml_add(ping, XML_PING_ATTR_PACEMAKERDSTATE, pacemakerd_state);
crm_xml_add_ll(ping, XML_ATTR_TSTAMP, (long long) pinged);
crm_xml_add_ll(ping, XML_ATTR_TSTAMP,
(long long) subdaemon_check_progress);
crm_xml_add(ping, XML_PING_ATTR_STATUS, "ok");
reply = create_reply(msg, ping);
free_xml(ping);
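With the two hunks above, the timestamp in pacemakerd's ping reply changes meaning: instead of the wall-clock time at which the ping was answered, XML_ATTR_TSTAMP now carries the last time the subdaemon liveness checks made progress, which is what lets a client such as sbd use the ping as a heartbeat. A minimal sketch of the watcher-side logic this enables; pacemakerd_seems_healthy() and STALL_TOLERANCE_S are hypothetical names chosen here for illustration, not part of the PR:

#include <stdbool.h>
#include <time.h>

/* Assumed tolerance before a watcher gives up on pacemakerd; a real
 * client would derive this from its own timeout configuration. */
#define STALL_TOLERANCE_S 30

/* reply_tstamp is the XML_ATTR_TSTAMP value parsed from the ping reply */
static bool
pacemakerd_seems_healthy(time_t reply_tstamp)
{
    /* healthy iff the subdaemon checks progressed recently */
    return (time(NULL) - reply_tstamp) <= STALL_TOLERANCE_S;
}
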
139 changes: 91 additions & 48 deletions daemons/pacemakerd/pcmkd_subdaemons.c
@@ -32,14 +32,16 @@ typedef struct pcmk_child_s {
const char *command;
const char *endpoint; /* IPC server name */
bool needs_cluster;
int check_count;

/* Anything below here will be dynamically initialized */
bool needs_retry;
bool active_before_startup;
} pcmk_child_t;

#define PCMK_PROCESS_CHECK_INTERVAL 5
#define SHUTDOWN_ESCALATION_PERIOD 180000 /* 3m */
#define PCMK_PROCESS_CHECK_INTERVAL 1
#define PCMK_PROCESS_CHECK_RETRIES 5
#define SHUTDOWN_ESCALATION_PERIOD 180000 /* 3m */

/* Index into the array below */
#define PCMK_CHILD_CONTROLD 5
@@ -82,6 +84,7 @@ static char *opts_vgrind[] = { NULL, NULL, NULL, NULL, NULL };

crm_trigger_t *shutdown_trigger = NULL;
crm_trigger_t *startup_trigger = NULL;
time_t subdaemon_check_progress = 0;

/* When contacted via pacemakerd-api by a client having sbd in
* the name we assume it is sbd-daemon which wants to know
@@ -103,7 +106,6 @@ gboolean running_with_sbd = FALSE; /* local copy */
GMainLoop *mainloop = NULL;

static gboolean fatal_error = FALSE;
static bool global_keep_tracking = false;

static gboolean check_active_before_startup_processes(gpointer user_data);
static int child_liveness(pcmk_child_t *child);
@@ -127,44 +129,94 @@ pcmkd_cluster_connected(void)
static gboolean
check_active_before_startup_processes(gpointer user_data)
{
gboolean keep_tracking = FALSE;

for (int i = 0; i < PCMK__NELEM(pcmk_children); i++) {
if (!pcmk_children[i].active_before_startup) {
/* we are already tracking it as a child process. */
continue;
} else {
int rc = child_liveness(&pcmk_children[i]);

switch (rc) {
case pcmk_rc_ok:
break;
case pcmk_rc_ipc_unresponsive:
case pcmk_rc_ipc_pid_only: // This case: it was previously OK
if (pcmk_children[i].respawn) {
crm_err("%s[%lld] terminated%s", pcmk_children[i].name,
(long long) PCMK__SPECIAL_PID_AS_0(pcmk_children[i].pid),
(rc == pcmk_rc_ipc_pid_only)? " as IPC server" : "");
} else {
/* orderly shutdown */
crm_notice("%s[%lld] terminated%s", pcmk_children[i].name,
(long long) PCMK__SPECIAL_PID_AS_0(pcmk_children[i].pid),
(rc == pcmk_rc_ipc_pid_only)? " as IPC server" : "");
}
pcmk_process_exit(&(pcmk_children[i]));
continue;
default:
crm_exit(CRM_EX_FATAL);
break; /* static analysis/noreturn */
static int next_child = 0;
int rc = child_liveness(&pcmk_children[next_child]);

crm_trace("%s[%lld] checked as %d",
pcmk_children[next_child].name,
(long long) PCMK__SPECIAL_PID_AS_0(
pcmk_children[next_child].pid),
rc);

switch (rc) {
case pcmk_rc_ok:
pcmk_children[next_child].check_count = 0;
next_child++;
subdaemon_check_progress = time(NULL);
break;
case pcmk_rc_ipc_pid_only: // This case: it was previously OK
pcmk_children[next_child].check_count++;
if (pcmk_children[next_child].check_count >= PCMK_PROCESS_CHECK_RETRIES) {
crm_err("%s[%lld] is unresponsive to ipc after %d tries but "
"we found the pid so have it killed that we can restart",
pcmk_children[next_child].name,
(long long) PCMK__SPECIAL_PID_AS_0(
pcmk_children[next_child].pid),
pcmk_children[next_child].check_count);
stop_child(&pcmk_children[next_child], SIGKILL);

Member:

In public clouds, it nowadays happens more often than before that subdaemons are unresponsive to IPC and get respawned.

As we know, if it's the controller that respawns, the node loses all its transient attributes in the CIB status section without them being written again. Not only are resources that rely on those attributes impacted, but the missing internal #feature-set attribute also results in a confusing MIXED-VERSION condition being shown by interfaces like crm_mon.

So far, PCMK_fail_fast=yes is probably the only workaround to bring the situation back to sanity, though of course at the cost of a node reboot.

While we've been trying to address this with ideas like #1699, I'm not sure whether it would make sense to increase the tolerance here, e.g. raising PCMK_PROCESS_CHECK_RETRIES or making it configurable... Or should we say that 5 failures in a row are bad enough to warrant recovery anyway?

Contributor (Author):

Sorry, I may be missing the reason for your comment here.
Previously, IPC wasn't checked on a periodic basis for all subdaemons.
The numbers are somewhat arbitrary: 1s is about the lowest retry interval that makes sense, and failing after 5 retries was an attempt to keep it as reactive as before in the cases where IPC was already being checked.

Member:

Nothing is wrong with the changes in this PR. I'm just bringing up the topic in this context :-)

Contributor:

Coincidentally I recently created https://projects.clusterlabs.org/T950 regarding this code, but it's not related unless you've only seen issues at cluster shutdown.

https://projects.clusterlabs.org/T73 is not directly related either but could affect the timing.

There is a 1s delay between checks of all subdaemons, so if they're all up, that's at least 6s between checks for any one subdaemon. 5 tries (30s) does seem plenty of time, so I wouldn't want to raise that. If a cloud host can't get enough cycles in 30s to respond to a check, it's probably unsuitable as an HA node.

Member:

Thanks for the info and opinion. I agree.
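
For reference, the 30s figure in the thread above falls directly out of the new constants; a tiny standalone sketch of the arithmetic, assuming all six pcmk_children[] entries are up and checked round-robin, one per 1s timer tick, as the new code does:

#include <stdio.h>

int main(void)
{
    const int n_subdaemons = 6;     /* size of pcmk_children[] */
    const int interval_s = 1;       /* PCMK_PROCESS_CHECK_INTERVAL */
    const int retries = 5;          /* PCMK_PROCESS_CHECK_RETRIES */

    /* One subdaemon is checked per tick, so each one is revisited
     * only after a full round over all of them. */
    int revisit_s = n_subdaemons * interval_s;      /* 6s */
    int failure_window_s = retries * revisit_s;     /* 30s */

    printf("revisit every %ds, declared unresponsive after ~%ds\n",
           revisit_s, failure_window_s);
    return 0;
}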

if (pcmk_children[next_child].respawn) {
/* as long as the respawn-limit isn't reached
give it another round of check retries
*/
pcmk_children[next_child].check_count = 0;
}
} else {
crm_notice("%s[%lld] is unresponsive to ipc after %d tries",
pcmk_children[next_child].name,
(long long) PCMK__SPECIAL_PID_AS_0(
pcmk_children[next_child].pid),
pcmk_children[next_child].check_count);
if (pcmk_children[next_child].respawn) {
/* as long as the respawn-limit isn't reached
and we haven't run out of connect retries
we account this as progress we are willing
to tell to sbd
*/
subdaemon_check_progress = time(NULL);
}
}
}
/* at least one of the processes found at startup
* is still going, so keep this recurring timer around */
keep_tracking = TRUE;
/* go to the next child and see if
we can make progress there
*/
next_child++;
break;
case pcmk_rc_ipc_unresponsive:
if (pcmk_children[next_child].respawn) {
crm_err("%s[%lld] terminated",
pcmk_children[next_child].name,
(long long) PCMK__SPECIAL_PID_AS_0(
pcmk_children[next_child].pid));
} else {
/* orderly shutdown */
crm_notice("%s[%lld] terminated",
pcmk_children[next_child].name,
(long long) PCMK__SPECIAL_PID_AS_0(
pcmk_children[next_child].pid));
}
pcmk_process_exit(&(pcmk_children[next_child]));
if (!pcmk_children[next_child].respawn) {
/* if a subdaemon is down and we don't want it
to be restarted this is a success during
shutdown. if it isn't restarted anymore
due to MAX_RESPAWN it is
rather no success.
*/
if (pcmk_children[next_child].respawn_count <= MAX_RESPAWN) {
subdaemon_check_progress = time(NULL);
}
next_child++;
}
break;
default:
crm_exit(CRM_EX_FATAL);
break; /* static analysis/noreturn */
}

global_keep_tracking = keep_tracking;
return keep_tracking;
if (next_child >= PCMK__NELEM(pcmk_children)) {
next_child = 0;
}

return G_SOURCE_CONTINUE;
}

static gboolean
@@ -257,11 +309,6 @@ pcmk_process_exit(pcmk_child_t * child)
child->name, child->endpoint);
/* need to monitor how it evolves, and start new process if badly */
child->active_before_startup = true;
if (!global_keep_tracking) {
global_keep_tracking = true;
g_timeout_add_seconds(PCMK_PROCESS_CHECK_INTERVAL,
check_active_before_startup_processes, NULL);
}

} else {
if (child->needs_cluster && !pcmkd_cluster_connected()) {
@@ -648,7 +695,6 @@ child_liveness(pcmk_child_t *child)
int
find_and_track_existing_processes(void)
{
bool tracking = false;
bool wait_in_progress;
int rc;
size_t i, rounds;
@@ -716,7 +762,6 @@ find_and_track_existing_processes(void)
pcmk_children[i].pid));
pcmk_children[i].respawn_count = -1; /* 0~keep watching */
pcmk_children[i].active_before_startup = true;
tracking = true;
break;
case pcmk_rc_ipc_pid_only:
if (pcmk_children[i].respawn_count == WAIT_TRIES) {
@@ -751,10 +796,8 @@ find_and_track_existing_processes(void)
pcmk_children[i].respawn_count = 0; /* restore pristine state */
}

if (tracking) {
g_timeout_add_seconds(PCMK_PROCESS_CHECK_INTERVAL,
g_timeout_add_seconds(PCMK_PROCESS_CHECK_INTERVAL,
check_active_before_startup_processes, NULL);
}
return pcmk_rc_ok;
}

22 changes: 22 additions & 0 deletions lib/common/ipc_client.c
@@ -1407,13 +1407,35 @@ pcmk__ipc_is_authentic_process_active(const char *name, uid_t refuid,
int32_t qb_rc;
pid_t found_pid = 0; uid_t found_uid = 0; gid_t found_gid = 0;
qb_ipcc_connection_t *c;
#ifdef HAVE_QB_IPCC_CONNECT_ASYNC
struct pollfd pollfd = { 0, };
int poll_rc;

c = qb_ipcc_connect_async(name, 0,
&(pollfd.fd));
#else
c = qb_ipcc_connect(name, 0);
#endif
if (c == NULL) {
crm_info("Could not connect to %s IPC: %s", name, strerror(errno));
rc = pcmk_rc_ipc_unresponsive;
goto bail;
}
#ifdef HAVE_QB_IPCC_CONNECT_ASYNC
pollfd.events = POLLIN;
do {
poll_rc = poll(&pollfd, 1, 2000);
} while ((poll_rc == -1) && (errno == EINTR));
if ((poll_rc <= 0) || (qb_ipcc_connect_continue(c) != 0)) {
crm_info("Could not connect to %s IPC: %s", name,
(poll_rc == 0)?"timeout":strerror(errno));
rc = pcmk_rc_ipc_unresponsive;
if (poll_rc > 0) {
c = NULL; // qb_ipcc_connect_continue cleaned up for us
}
goto bail;
}
#endif

qb_rc = qb_ipcc_fd_get(c, &fd);
if (qb_rc != 0) {
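
Pulled out of pcmk__ipc_is_authentic_process_active(), the new connect path reduces to the pattern below; a standalone sketch assuming a libqb that provides the async API, with ipc_connect_with_timeout and its timeout parameter being illustrative choices (the signatures of qb_ipcc_connect_async() and qb_ipcc_connect_continue() are as used in the diff above):

#include <errno.h>
#include <poll.h>
#include <stddef.h>
#include <qb/qbipcc.h>

/* Connect to a libqb IPC server without blocking indefinitely:
 * qb_ipcc_connect_async() sends the setup request and hands back the
 * fd to poll on; qb_ipcc_connect_continue() finishes the handshake. */
static qb_ipcc_connection_t *
ipc_connect_with_timeout(const char *name, int timeout_ms)
{
    struct pollfd pfd = { .fd = -1, .events = POLLIN };
    qb_ipcc_connection_t *c = qb_ipcc_connect_async(name, 0, &pfd.fd);
    int rc;

    if (c == NULL) {
        return NULL;                /* setup request could not be sent */
    }

    do {
        rc = poll(&pfd, 1, timeout_ms);   /* wait for the server reply */
    } while ((rc == -1) && (errno == EINTR));

    if (rc <= 0) {
        /* timeout or poll error: the half-open connection is still
         * ours to release */
        qb_ipcc_disconnect(c);
        return NULL;
    }
    if (qb_ipcc_connect_continue(c) != 0) {
        /* per the comment in the diff, qb_ipcc_connect_continue()
         * cleans the connection up itself on failure */
        return NULL;
    }
    return c;   /* usable exactly like a qb_ipcc_connect() result */
}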