Skip to content

Commit

Permalink
Merge pull request #2588 from wenningerk/subdaemon_heartbeat_mod_libqb
Browse files Browse the repository at this point in the history
Subdaemon heartbeat with modified libqb (async API for connect)
  • Loading branch information
kgaillot authored Jan 19, 2022
2 parents b0c2196 + 8e8a4a3 commit 2c937a4
Show file tree
Hide file tree
Showing 6 changed files with 125 additions and 58 deletions.
11 changes: 5 additions & 6 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -1309,10 +1309,11 @@ PKG_CHECK_MODULES(libqb, libqb >= 0.17)
CPPFLAGS="$libqb_CFLAGS $CPPFLAGS"
LIBS="$libqb_LIBS $LIBS"

dnl libqb libqb-2.0.3 + ipc-connect-async-API (2022-01)
AC_CHECK_FUNCS([qb_ipcc_connect_async])

dnl libqb 2.0.2+ (2020-10)
AC_CHECK_FUNCS(qb_ipcc_auth_get,
AC_DEFINE(HAVE_IPCC_AUTH_GET, 1,
[Have qb_ipcc_auth_get function]))
AC_CHECK_FUNCS([qb_ipcc_auth_get])

dnl libqb 2.0.0+ (2020-05)
CHECK_ENUM_VALUE([qb/qblog.h],[qb_log_conf],[QB_LOG_CONF_MAX_LINE_LEN])
Expand Down Expand Up @@ -1585,9 +1586,7 @@ AS_IF([test $with_corosync -ne $DISABLED],
dnl Shutdown tracking added (back) to corosync Jan 2021
saved_LIBS="$LIBS"
LIBS="$LIBS $COROSYNC_LIBS"
AC_CHECK_FUNCS(corosync_cfg_trackstart,
AC_DEFINE(HAVE_COROSYNC_CFG_TRACKSTART, 1,
[Have corosync_cfg_trackstart function]))
AC_CHECK_FUNCS([corosync_cfg_trackstart])
LIBS="$saved_LIBS"
]
)
Expand Down
2 changes: 2 additions & 0 deletions daemons/pacemakerd/pacemakerd.c
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,8 @@ main(int argc, char **argv)
pcmk_ipc_api_t *old_instance = NULL;
qb_ipcs_service_t *ipcs = NULL;

subdaemon_check_progress = time(NULL);

crm_log_preinit(NULL, argc, argv);
mainloop_add_signal(SIGHUP, pcmk_ignore);
mainloop_add_signal(SIGQUIT, pcmk_sigquit);
Expand Down
3 changes: 2 additions & 1 deletion daemons/pacemakerd/pacemakerd.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright 2010-2021 the Pacemaker project contributors
* Copyright 2010-2022 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
Expand All @@ -21,6 +21,7 @@ extern unsigned int shutdown_complete_state_reported_to;
extern gboolean shutdown_complete_state_reported_client_closed;
extern crm_trigger_t *shutdown_trigger;
extern crm_trigger_t *startup_trigger;
extern time_t subdaemon_check_progress;

gboolean mcp_read_config(void);

Expand Down
6 changes: 3 additions & 3 deletions daemons/pacemakerd/pcmkd_messages.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright 2010-2021 the Pacemaker project contributors
* Copyright 2010-2022 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
Expand All @@ -25,7 +25,6 @@ pcmk_handle_ping_request(pcmk__client_t *c, xmlNode *msg, uint32_t id)
const char *value = NULL;
xmlNode *ping = NULL;
xmlNode *reply = NULL;
time_t pinged = time(NULL);
const char *from = crm_element_value(msg, F_CRM_SYS_FROM);

/* Pinged for status */
Expand All @@ -36,7 +35,8 @@ pcmk_handle_ping_request(pcmk__client_t *c, xmlNode *msg, uint32_t id)
value = crm_element_value(msg, F_CRM_SYS_TO);
crm_xml_add(ping, XML_PING_ATTR_SYSFROM, value);
crm_xml_add(ping, XML_PING_ATTR_PACEMAKERDSTATE, pacemakerd_state);
crm_xml_add_ll(ping, XML_ATTR_TSTAMP, (long long) pinged);
crm_xml_add_ll(ping, XML_ATTR_TSTAMP,
(long long) subdaemon_check_progress);
crm_xml_add(ping, XML_PING_ATTR_STATUS, "ok");
reply = create_reply(msg, ping);
free_xml(ping);
Expand Down
139 changes: 91 additions & 48 deletions daemons/pacemakerd/pcmkd_subdaemons.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,16 @@ typedef struct pcmk_child_s {
const char *command;
const char *endpoint; /* IPC server name */
bool needs_cluster;
int check_count;

/* Anything below here will be dynamically initialized */
bool needs_retry;
bool active_before_startup;
} pcmk_child_t;

#define PCMK_PROCESS_CHECK_INTERVAL 5
#define SHUTDOWN_ESCALATION_PERIOD 180000 /* 3m */
#define PCMK_PROCESS_CHECK_INTERVAL 1
#define PCMK_PROCESS_CHECK_RETRIES 5
#define SHUTDOWN_ESCALATION_PERIOD 180000 /* 3m */

/* Index into the array below */
#define PCMK_CHILD_CONTROLD 5
Expand Down Expand Up @@ -82,6 +84,7 @@ static char *opts_vgrind[] = { NULL, NULL, NULL, NULL, NULL };

crm_trigger_t *shutdown_trigger = NULL;
crm_trigger_t *startup_trigger = NULL;
time_t subdaemon_check_progress = 0;

/* When contacted via pacemakerd-api by a client having sbd in
* the name we assume it is sbd-daemon which wants to know
Expand All @@ -103,7 +106,6 @@ gboolean running_with_sbd = FALSE; /* local copy */
GMainLoop *mainloop = NULL;

static gboolean fatal_error = FALSE;
static bool global_keep_tracking = false;

static gboolean check_active_before_startup_processes(gpointer user_data);
static int child_liveness(pcmk_child_t *child);
Expand All @@ -127,44 +129,94 @@ pcmkd_cluster_connected(void)
static gboolean
check_active_before_startup_processes(gpointer user_data)
{
gboolean keep_tracking = FALSE;

for (int i = 0; i < PCMK__NELEM(pcmk_children); i++) {
if (!pcmk_children[i].active_before_startup) {
/* we are already tracking it as a child process. */
continue;
} else {
int rc = child_liveness(&pcmk_children[i]);

switch (rc) {
case pcmk_rc_ok:
break;
case pcmk_rc_ipc_unresponsive:
case pcmk_rc_ipc_pid_only: // This case: it was previously OK
if (pcmk_children[i].respawn) {
crm_err("%s[%lld] terminated%s", pcmk_children[i].name,
(long long) PCMK__SPECIAL_PID_AS_0(pcmk_children[i].pid),
(rc == pcmk_rc_ipc_pid_only)? " as IPC server" : "");
} else {
/* orderly shutdown */
crm_notice("%s[%lld] terminated%s", pcmk_children[i].name,
(long long) PCMK__SPECIAL_PID_AS_0(pcmk_children[i].pid),
(rc == pcmk_rc_ipc_pid_only)? " as IPC server" : "");
}
pcmk_process_exit(&(pcmk_children[i]));
continue;
default:
crm_exit(CRM_EX_FATAL);
break; /* static analysis/noreturn */
static int next_child = 0;
int rc = child_liveness(&pcmk_children[next_child]);

crm_trace("%s[%lld] checked as %d",
pcmk_children[next_child].name,
(long long) PCMK__SPECIAL_PID_AS_0(
pcmk_children[next_child].pid),
rc);

switch (rc) {
case pcmk_rc_ok:
pcmk_children[next_child].check_count = 0;
next_child++;
subdaemon_check_progress = time(NULL);
break;
case pcmk_rc_ipc_pid_only: // This case: it was previously OK
pcmk_children[next_child].check_count++;
if (pcmk_children[next_child].check_count >= PCMK_PROCESS_CHECK_RETRIES) {
crm_err("%s[%lld] is unresponsive to ipc after %d tries but "
"we found the pid so have it killed that we can restart",
pcmk_children[next_child].name,
(long long) PCMK__SPECIAL_PID_AS_0(
pcmk_children[next_child].pid),
pcmk_children[next_child].check_count);
stop_child(&pcmk_children[next_child], SIGKILL);
if (pcmk_children[next_child].respawn) {
/* as long as the respawn-limit isn't reached
give it another round of check retries
*/
pcmk_children[next_child].check_count = 0;
}
} else {
crm_notice("%s[%lld] is unresponsive to ipc after %d tries",
pcmk_children[next_child].name,
(long long) PCMK__SPECIAL_PID_AS_0(
pcmk_children[next_child].pid),
pcmk_children[next_child].check_count);
if (pcmk_children[next_child].respawn) {
/* as long as the respawn-limit isn't reached
and we haven't run out of connect retries
we account this as progress we are willing
to tell to sbd
*/
subdaemon_check_progress = time(NULL);
}
}
}
/* at least one of the processes found at startup
* is still going, so keep this recurring timer around */
keep_tracking = TRUE;
/* go to the next child and see if
we can make progress there
*/
next_child++;
break;
case pcmk_rc_ipc_unresponsive:
if (pcmk_children[next_child].respawn) {
crm_err("%s[%lld] terminated",
pcmk_children[next_child].name,
(long long) PCMK__SPECIAL_PID_AS_0(
pcmk_children[next_child].pid));
} else {
/* orderly shutdown */
crm_notice("%s[%lld] terminated",
pcmk_children[next_child].name,
(long long) PCMK__SPECIAL_PID_AS_0(
pcmk_children[next_child].pid));
}
pcmk_process_exit(&(pcmk_children[next_child]));
if (!pcmk_children[next_child].respawn) {
/* if a subdaemon is down and we don't want it
to be restarted this is a success during
shutdown. if it isn't restarted anymore
due to MAX_RESPAWN it is
rather no success.
*/
if (pcmk_children[next_child].respawn_count <= MAX_RESPAWN) {
subdaemon_check_progress = time(NULL);
}
next_child++;
}
break;
default:
crm_exit(CRM_EX_FATAL);
break; /* static analysis/noreturn */
}

global_keep_tracking = keep_tracking;
return keep_tracking;
if (next_child >= PCMK__NELEM(pcmk_children)) {
next_child = 0;
}

return G_SOURCE_CONTINUE;
}

static gboolean
Expand Down Expand Up @@ -257,11 +309,6 @@ pcmk_process_exit(pcmk_child_t * child)
child->name, child->endpoint);
/* need to monitor how it evolves, and start new process if badly */
child->active_before_startup = true;
if (!global_keep_tracking) {
global_keep_tracking = true;
g_timeout_add_seconds(PCMK_PROCESS_CHECK_INTERVAL,
check_active_before_startup_processes, NULL);
}

} else {
if (child->needs_cluster && !pcmkd_cluster_connected()) {
Expand Down Expand Up @@ -648,7 +695,6 @@ child_liveness(pcmk_child_t *child)
int
find_and_track_existing_processes(void)
{
bool tracking = false;
bool wait_in_progress;
int rc;
size_t i, rounds;
Expand Down Expand Up @@ -716,7 +762,6 @@ find_and_track_existing_processes(void)
pcmk_children[i].pid));
pcmk_children[i].respawn_count = -1; /* 0~keep watching */
pcmk_children[i].active_before_startup = true;
tracking = true;
break;
case pcmk_rc_ipc_pid_only:
if (pcmk_children[i].respawn_count == WAIT_TRIES) {
Expand Down Expand Up @@ -751,10 +796,8 @@ find_and_track_existing_processes(void)
pcmk_children[i].respawn_count = 0; /* restore pristine state */
}

if (tracking) {
g_timeout_add_seconds(PCMK_PROCESS_CHECK_INTERVAL,
g_timeout_add_seconds(PCMK_PROCESS_CHECK_INTERVAL,
check_active_before_startup_processes, NULL);
}
return pcmk_rc_ok;
}

Expand Down
22 changes: 22 additions & 0 deletions lib/common/ipc_client.c
Original file line number Diff line number Diff line change
Expand Up @@ -1407,13 +1407,35 @@ pcmk__ipc_is_authentic_process_active(const char *name, uid_t refuid,
int32_t qb_rc;
pid_t found_pid = 0; uid_t found_uid = 0; gid_t found_gid = 0;
qb_ipcc_connection_t *c;
#ifdef HAVE_QB_IPCC_CONNECT_ASYNC
struct pollfd pollfd = { 0, };
int poll_rc;

c = qb_ipcc_connect_async(name, 0,
&(pollfd.fd));
#else
c = qb_ipcc_connect(name, 0);
#endif
if (c == NULL) {
crm_info("Could not connect to %s IPC: %s", name, strerror(errno));
rc = pcmk_rc_ipc_unresponsive;
goto bail;
}
#ifdef HAVE_QB_IPCC_CONNECT_ASYNC
pollfd.events = POLLIN;
do {
poll_rc = poll(&pollfd, 1, 2000);
} while ((poll_rc == -1) && (errno == EINTR));
if ((poll_rc <= 0) || (qb_ipcc_connect_continue(c) != 0)) {
crm_info("Could not connect to %s IPC: %s", name,
(poll_rc == 0)?"timeout":strerror(errno));
rc = pcmk_rc_ipc_unresponsive;
if (poll_rc > 0) {
c = NULL; // qb_ipcc_connect_continue cleaned up for us
}
goto bail;
}
#endif

qb_rc = qb_ipcc_fd_get(c, &fd);
if (qb_rc != 0) {
Expand Down

0 comments on commit 2c937a4

Please sign in to comment.