From d07319507a7ee4726319f21b79cabbc7e34ac6d0 Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Tue, 15 Sep 2020 06:33:04 -0700 Subject: [PATCH] resource: avoid segfault during teardown Problem: unloading resource module with events posted to eventlog in flight can result in segfault. Program terminated with signal SIGSEGV, Segmentation fault. #0 __strcmp_avx2 () at ../sysdeps/x86_64/multiarch/strcmp-avx2.S:102 102 ../sysdeps/x86_64/multiarch/strcmp-avx2.S: No such file or directory. [Current thread is 1 (Thread 0x7fe74b7fe700 (LWP 3495430))] (gdb) bt #0 __strcmp_avx2 () at ../sysdeps/x86_64/multiarch/strcmp-avx2.S:102 #1 0x00007fe764f40de0 in aux_item_find (key=<optimized out>, head=0x7fe73c006180) at aux.c:88 #2 aux_get (head=<optimized out>, key=0x7fe764f5b000 "flux::log") at aux.c:119 #3 0x00007fe764f1f0d4 in getctx (h=h@entry=0x7fe73c00c6d0) at flog.c:72 #4 0x00007fe764f1f3a5 in flux_vlog (h=0x7fe73c00c6d0, level=7, fmt=0x7fe7606318fc "%s: %s event posted", ap=ap@entry=0x7fe74b7fd790) at flog.c:146 #5 0x00007fe764f1f333 in flux_log (h=<optimized out>, lev=lev@entry=7, fmt=fmt@entry=0x7fe7606318fc "%s: %s event posted") at flog.c:195 #6 0x00007fe76061166a in reslog_cb (reslog=<optimized out>, name=0x7fe73c016380 "online", arg=0x7fe73c013000) at acquire.c:319 #7 0x00007fe760610deb in notify_callback (event=<optimized out>, reslog=0x7fe73c005b90) at reslog.c:47 #8 post_handler (reslog=reslog@entry=0x7fe73c005b90, f=0x7fe73c00a510) at reslog.c:91 #9 0x00007fe760611250 in reslog_destroy (reslog=0x7fe73c005b90) at reslog.c:182 #10 0x00007fe76060e6b8 in resource_ctx_destroy (ctx=ctx@entry=0x7fe73c016640) at resource.c:129 #11 0x00007fe76060ef18 in resource_ctx_destroy (ctx=0x7fe73c016640) at resource.c:331 It looks like the acquire subsystem got a callback for a rank coming online after its context was freed. Set the reslog callback to NULL before destroying the acquire context. Also, set the monitor callback to NULL before destroying the discover context, as it appears this destructor has a similar safety issue. 
--- src/modules/resource/acquire.c | 1 + src/modules/resource/discover.c | 1 + 2 files changed, 2 insertions(+) diff --git a/src/modules/resource/acquire.c b/src/modules/resource/acquire.c index 785c04cd6f46..1642c6027594 100644 --- a/src/modules/resource/acquire.c +++ b/src/modules/resource/acquire.c @@ -379,6 +379,7 @@ void acquire_destroy (struct acquire *acquire) if (acquire) { int saved_errno = errno; flux_msg_handler_delvec (acquire->handlers); + reslog_set_callback (acquire->ctx->reslog, NULL, NULL); if (acquire->request) { if (flux_respond_error (acquire->ctx->h, acquire->request->msg, diff --git a/src/modules/resource/discover.c b/src/modules/resource/discover.c index 63bbf636301a..27a56a0662ae 100644 --- a/src/modules/resource/discover.c +++ b/src/modules/resource/discover.c @@ -278,6 +278,7 @@ void discover_destroy (struct discover *discover) { if (discover) { int saved_errno = errno; + monitor_set_callback (discover->ctx->monitor, NULL, NULL); flux_subprocess_destroy (discover->p); flux_future_destroy (discover->f); flux_msg_handler_delvec (discover->handlers);