Skip to content

Commit

Permalink
Merge pull request #3130 from dongahn/tv-jobid
Browse files Browse the repository at this point in the history
flux-job: add totalview_jobid support and misc. fixes
  • Loading branch information
mergify[bot] authored Aug 21, 2020
2 parents fad8759 + 11f1c67 commit 96b3edc
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 7 deletions.
20 changes: 15 additions & 5 deletions src/cmd/flux-job.c
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ int MPIR_i_am_starter = 1;
/* Debugger-interface flags, each initialized to 1 in this file.
 * NOTE(review): their exact meaning is defined by the MPIR process
 * acquisition interface, not by anything visible here -- confirm against
 * that specification before changing the values.
 */
int MPIR_acquired_pre_main = 1;
int MPIR_force_to_main = 1;
int MPIR_partial_attach_ok = 1;
/* Job ID rendered as a decimal string.  cmd_attach() allocates it when
 * MPIR_being_debugged is set and frees it before returning; otherwise it
 * stays NULL.
 * NOTE(review): presumably read by the TotalView debugger via the symbol
 * name -- confirm.
 */
char *totalview_jobid = NULL;

/* Prototypes for command handlers (presumably flux-job subcommands;
 * the definitions are not visible in this view).
 */
int cmd_list (optparse_t *p, int argc, char **argv);
int cmd_list_inactive (optparse_t *p, int argc, char **argv);
Expand Down Expand Up @@ -224,6 +225,9 @@ static struct optparse_option attach_opts[] = {
{ .name = "quiet", .key = 'q', .has_arg = 0,
.usage = "Suppress warnings written to stderr from flux-job",
},
{ .name = "debug", .has_arg = 0,
.usage = "Enable parallel debugger to attach to a running job",
},
{ .name = "debug-emulate", .has_arg = 0, .flags = OPTPARSE_OPT_HIDDEN,
.usage = "Set MPIR_being_debugged for testing",
},
Expand Down Expand Up @@ -1679,9 +1683,7 @@ static void valid_or_exit_for_debug (struct attach_ctx *ctx)

if (state != FLUX_JOB_NEW && state != FLUX_JOB_DEPEND
&& state != FLUX_JOB_SCHED && state != FLUX_JOB_RUN) {
errno = EINVAL;
log_err_exit ("Invalid job state (%s) for debugging",
flux_job_statetostr(state, false));
log_msg_exit ("cannot debug job that isn't running");
}

return;
Expand Down Expand Up @@ -2065,10 +2067,17 @@ int cmd_attach (optparse_t *p, int argc, char **argv)
if (!(r = flux_get_reactor (ctx.h)))
log_err_exit ("flux_get_reactor");

if (optparse_hasopt (ctx.p, "debug-emulate"))
if (optparse_hasopt (ctx.p, "debug")
|| optparse_hasopt (ctx.p, "debug-emulate")) {
MPIR_being_debugged = 1;
if (MPIR_being_debugged)
}
if (MPIR_being_debugged) {
int verbose = optparse_getopt (p, "verbose", NULL);
valid_or_exit_for_debug (&ctx);
totalview_jobid = xasprintf ("%ju", (uintmax_t)ctx.id);
if (verbose > 1)
log_msg ("totalview_jobid=%s", totalview_jobid);
}

if (!(ctx.eventlog_f = flux_job_event_watch (ctx.h,
ctx.id,
Expand Down Expand Up @@ -2121,6 +2130,7 @@ int cmd_attach (optparse_t *p, int argc, char **argv)
flux_watcher_destroy (ctx.stdin_w);
flux_close (ctx.h);
free (ctx.service);
free (totalview_jobid);
return ctx.exit_code;
}

Expand Down
41 changes: 39 additions & 2 deletions t/t2611-debug-emulate.t
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,15 @@ stop_tasks_test() {
test_under_flux 2

# parse_jobid FILE
#
# Print the second whitespace-separated field of every line in FILE that
# begins with "jobid:".  The match is anchored so that other lines that
# merely contain the substring "jobid" (e.g. "totalview_jobid=...") are
# not picked up.
#
# Side effect: sets the shell variables `outfile` and `jobid` in the
# calling shell unless invoked inside a command substitution.
parse_jobid() {
    outfile=$1 &&
    jobid=$(cat "${outfile}" | grep ^jobid: | awk '{ print $2 }') &&
    echo ${jobid}
}

# parse_totalview_jobid FILE
#
# For every line of FILE containing "totalview_jobid", take the second
# whitespace-separated field (expected form: "totalview_jobid=<id>"),
# split it on "=" and print the part after the first "=".
#
# Side effect: sets the shell variables `outfile` and `jobid` in the
# calling shell unless invoked inside a command substitution.
parse_totalview_jobid() {
    outfile=$1 &&
    jobid=$(cat ${outfile} |
        awk '/totalview_jobid/ { split($2, kv, "="); print kv[2] }') &&
    echo ${jobid}
}

Expand Down Expand Up @@ -68,4 +75,34 @@ test_expect_success 'debug-emulate: attaching to a failed job must fail' '
test_must_fail flux job attach --debug-emulate ${jobid}
'

# Submit a 1-task job running ${stall}, wait for its "start" event and for
# the done2 file, then attach with --debug-emulate and capture stderr in
# jobid.out2.  -vv is required because flux-job only logs
# "totalview_jobid=<id>" when the verbose count is greater than 1.  The
# test passes when the value parsed from that log line equals the job id
# reported by `flux job id`.
test_expect_success 'debugger: totalview_jobid is set for attach mode' '
jobid=$(flux jobspec srun -n 1 ${stall} done2 10 | flux job submit) &&
jobid=$(flux job id ${jobid}) &&
flux job wait-event -vt 2.5 ${jobid} start &&
${waitfile} -v -t 2.5 done2 &&
flux job attach -vv --debug-emulate ${jobid} 2> jobid.out2 &&
flux job wait-event -vt 2.5 ${jobid} finish &&
tv_jobid=$(parse_totalview_jobid jobid.out2) &&
test ${tv_jobid} = "${jobid}"
'

# flux_job_attach JOBID ERRFILE
#
# Start `flux job attach -vv --debug JOBID` in the background with its
# stderr redirected to ERRFILE, then run ${waitfile} on ERRFILE with the
# pattern "totalview_jobid".  The function's exit status is that of the
# ${waitfile} command.
# NOTE(review): ${waitfile} is defined outside this view; presumably it
# blocks (up to the -t value) until the pattern appears in the file --
# confirm.  The background attach process is not waited on or killed here.
flux_job_attach() {
flux job attach -vv --debug ${1} 2> ${2} &
${waitfile} -v -t 2.5 --pattern="totalview_jobid" ${2}
}

# flux job attach --debug JOBID must not continue target processes.
#
# Steps: submit a 1-task job running ${stall}, wait for its "start" event
# and the done3 file, attach in the background with --debug (via
# flux_job_attach), and check that the logged totalview_jobid equals the
# job id.  The job is then expected NOT to reach "finish" on its own
# (test_must_fail on wait-event); it is finally cancelled and only then
# must the "finish" event be observed.
test_expect_success 'debugger: job attach --debug must not continue target' '
jobid=$(flux jobspec srun -n 1 ${stall} done3 100 | flux job submit) &&
jobid=$(flux job id ${jobid}) &&
flux job wait-event -vt 2.5 ${jobid} start &&
${waitfile} -v -t 2.5 done3 &&
flux_job_attach ${jobid} jobid.out3 &&
tv_jobid=$(parse_totalview_jobid jobid.out3) &&
test ${tv_jobid} = "${jobid}" &&
test_must_fail flux job wait-event -vt 2.5 ${jobid} finish &&
flux job cancel ${jobid} &&
flux job wait-event -vt 2.5 ${jobid} finish
'

test_done

0 comments on commit 96b3edc

Please sign in to comment.