From 05e1cac2496f842f70744dc5210fb3072ef32f3a Mon Sep 17 00:00:00 2001 From: Andrew Burgess Date: Sat, 22 Jul 2023 15:32:29 +0100 Subject: [PATCH] gdb: fix vfork regressions when target-non-stop is off It was pointed out on the mailing list[1] that after this commit: commit b1e0126ec56e099d753c20e91a9f8623aabd6b46 Date: Wed Jun 21 14:18:54 2023 +0100 gdb: don't resume vfork parent while child is still running the test gdb.base/vfork-follow-parent.exp now has some failures when run with the native-gdbserver or native-extended-gdbserver boards: FAIL: gdb.base/vfork-follow-parent.exp: resolution_method=schedule-multiple: continue to end of inferior 2 (timeout) FAIL: gdb.base/vfork-follow-parent.exp: resolution_method=schedule-multiple: inferior 1 (timeout) FAIL: gdb.base/vfork-follow-parent.exp: resolution_method=schedule-multiple: print unblock_parent = 1 (timeout) FAIL: gdb.base/vfork-follow-parent.exp: resolution_method=schedule-multiple: continue to break_parent (timeout) The reason that these failures don't show up when run on the standard unix board is that the test is only run in the default operating mode, so for Linux this will be all-stop on top of non-stop. If we adjust the test script so that it runs in the default mode and with target-non-stop turned off, then we see the same failures on the unix board. This commit includes this change. The way that the test is written means that it is not (currently) possible to turn on non-stop mode and have the test still work, so this commit does not do that. I have also updated the test script so that the vfork child performs an exec as well as the current exit. Exec and exit are the two ways in which a vfork child can release the vfork parent, so testing both of these cases is useful I think. In this test the inferior performs a vfork and the vfork-child immediately exits. The vfork-parent will wait for the vfork-child and then blocks waiting for gdb. Once gdb has released the vfork-parent, the vfork-parent also exits. In the test that fails, GDB sets 'detach-on-fork off' and then runs to the vfork. At this point the test tries to just "continue", but this fails as the vfork-parent is still selected, and the parent can't continue until the vfork-child completes. As the vfork-child is stopped by GDB the parent will never stop once resumed, so GDB refuses to resume it. The test script then sets 'schedule-multiple on' and once again continues. This time GDB, in theory, resumes both the parent and the child, the parent will be held blocked by the kernel, but the child will run until it exits, and which point GDB stops again, this time with inferior 2, the newly exited vfork-child, selected. What happens after this in the test script is irrelevant as far as this failure is concerned. To understand why the test started failing we should consider the behaviour of four different cases: 1. All-stop-on-non-stop before commit b1e0126ec56e, 2. All-stop-on-non-stop after commit b1e0126ec56e, 3. All-stop-on-all-stop before commit b1e0126ec56e, and 4. All-stop-on-all-stop after commit b1e0126ec56e. Only case #4 is failing after commit b1e0126ec56e, but I think the other cases are interesting because, (a) they inform how we might fix the regression, and (b) it turns out the behaviour of #2 changed too with the commit, but the change was harmless. For #1 All-stop-on-non-stop before commit b1e0126ec56e, what happens is: 1. GDB calls proceed with the vfork-parent selected, as schedule multiple is on user_visible_resume_ptid returns -1 (everything) as the resume_ptid (see proceed function), 2. As this is all-stop-on-non-stop, every thread is resumed individually, so GDB tries to resume both the vfork-parent and the vfork-child, both of which succeed, 3. The vfork-parent is held stopped by the kernel, 4. The vfork-child completes (exits) at which point the GDB sees the EXITED event for the vfork-child and the VFORK_DONE event for the vfork-parent, 5. At this point we might take two paths depending on which event GDB handles first, if GDB handles the VFORK_DONE first then: (a) As GDB is controlling both parent and child the VFORK_DONE is ignored (see handle_vfork_done), the vfork-parent will be resumed, (b) GDB processes the EXITED event, selects the (now defunct) vfork-child, and stops, returning control to the user. Alternatively, if GDB selects the EXITED event first then: (c) GDB processes the EXITED event, selects the (now defunct) vfork-child, and stops, returning control to the user. (d) At some future time the user resumes the vfork-parent, at which point the VFORK_DONE is reported to GDB, however, GDB is ignoring the VFORK_DONE (see handle_vfork_done), so the parent is resumed. For case #2, all-stop-on-non-stop after commit b1e0126ec56e, the important difference is in step (2) above, now, instead of resuming both the vfork-parent and the vfork-child, only the vfork-child is resumed. As such, when we get to step (5), only a single event, the EXITED event is reported. GDB handles the EXITED just as in (5)(c), then, later, when the user resumes the vfork-parent, the VFORKED_DONE is immediately delivered from the kernel, but this is ignored just as in (5)(d), and so, though the pattern of when the vfork-parent is resumed changes, the overall pattern of which events are reported and when, doesn't actually change. In fact, by not resuming the vfork-parent, the order of events (in this test) is now deterministic, which (maybe?) is a good thing. If we now consider case #3, all-stop-on-all-stop before commit b1e0126ec56e, then what happens is: 1. GDB calls proceed with the vfork-parent selected, as schedule multiple is on user_visible_resume_ptid returns -1 (everything) as the resume_ptid (see proceed function), 2. As this is all-stop-on-all-stop, the resume is passed down to the linux-nat target, the vfork-parent is the event thread, while the vfork-child is a sibling of the event thread, 3. In linux_nat_target::resume, GDB calls linux_nat_resume_callback for all threads, this causes the vfork-child to be resumed. Then in linux_nat_target::resume, the event thread, the vfork-parent, is also resumed. 4. The vfork-parent is held stopped by the kernel, 5. The vfork-child completes (exits) at which point the GDB sees the EXITED event for the vfork-child and the VFORK_DONE event for the vfork-parent, 6. We are now in a situation identical to step (5) as for all-stop-on-non-stop above, GDB selects one of the events to handle, and whichever we select the user sees the correct behaviour. And so, finally, we can consider #4, all-stop-on-all-stop after commit b1e0126ec56e, this is the case that started failing. We start out just like above, in proceed, the resume_ptid is -1 (resume everything), due to schedule multiple being on. And just like above, due to the target being all-stop, we call proceed_resume_thread_checked just once, for the current thread, which, remember, is the vfork-parent thread. The change in commit b1e0126ec56e was to avoid resuming a vfork-parent thread, read the commit message for the justification for this change. However, this means that GDB now rejects resuming the vfork-parent in this case, which means that nothing gets resumed! Obviously, if nothing resumes, then nothing will ever stop, and so GDB appears to hang. I considered a couple of solutions which, in the end, I didn't go with, these were: 1. Move the vfork-parent check out of proceed_resume_thread_checked, and place it in proceed, but only on the all-stop-on-non-stop path, this should still address the issue seen in b1e0126ec56e, but would avoid the issue seen here. I rejected this just because it didn't feel great to split the checks that exist in proceed_resume_thread_checked like this, 2. Extend the condition in proceed_resume_thread_checked by adding a target_is_non_stop_p check. This would have the same effect as idea 1, but leaves all the checks in the same place, which I think would be better, but this still just didn't feel right to me, and so, What I noticed was that for the all-stop-on-non-stop, after commit b1e0126ec56e, we only resumed the vfork-child, and this seems fine. The vfork-parent isn't going to run anyway (the kernel will hold it back), so if feels like we there's no harm in just waiting for the child to complete, and then resuming the parent. So then I started looking at follow_fork, which is called from the top of proceed. This function already has the task of switching between the parent and child based on which the user wishes to follow. So, I wondered, could we use this to switch to the vfork-child in the case that we are attached to both? Turns out this is pretty simple to do. Having done that, now the process is for all-stop-on-all-stop after commit b1e0126ec56e, and with this new fix is: 1. GDB calls proceed with the vfork-parent selected, but, 2. In follow_fork, and follow_fork_inferior, GDB switches the selected thread to be that of the vfork-child, 3. Back in proceed user_visible_resume_ptid returns -1 (everything) as the resume_ptid still, but now, 4. When GDB calls proceed_resume_thread_checked, the vfork-child is the current selected thread, this is not a vfork-parent, and so GDB allows the proceed to continue to the linux-nat target, 5. In linux_nat_target::resume, GDB calls linux_nat_resume_callback for all threads, this does not resume the vfork-parent (because it is a vfork-parent), and then the vfork-child is resumed as this is the event thread, At this point we are back in the same situation as for all-stop-on-non-stop after commit b1e0126ec56e, that is, the vfork-child is resumed, while the vfork-parent is held stopped by GDB. Eventually the vfork-child will exit or exec, at which point the vfork-parent will be resumed. [1] https://inbox.sourceware.org/gdb-patches/3e1e1db0-13d9-dd32-b4bb-051149ae6e76@simark.ca/ --- gdb/infrun.c | 8 +-- gdb/testsuite/gdb.base/vfork-follow-parent.c | 29 ++++++++++- .../gdb.base/vfork-follow-parent.exp | 50 ++++++++++++++++--- 3 files changed, 75 insertions(+), 12 deletions(-) diff --git a/gdb/infrun.c b/gdb/infrun.c index 8286026e6c6..72852e63906 100644 --- a/gdb/infrun.c +++ b/gdb/infrun.c @@ -713,7 +713,7 @@ holding the child stopped. Try \"set detach-on-fork\" or \ (do not restore the parent as the current inferior). */ gdb::optional maybe_restore; - if (!follow_child) + if (!follow_child && !sched_multi) maybe_restore.emplace (); switch_to_thread (*child_inf->threads ().begin ()); @@ -3400,8 +3400,10 @@ proceed (CORE_ADDR addr, enum gdb_signal siggnal) struct gdbarch *gdbarch; CORE_ADDR pc; - /* If we're stopped at a fork/vfork, follow the branch set by the - "set follow-fork-mode" command; otherwise, we'll just proceed + /* If we're stopped at a fork/vfork, switch to either the parent or child + thread as defined by the "set follow-fork-mode" command, or, if both + the parent and child are controlled by GDB, and schedule-multiple is + on, follow the child. If none of the above apply then we just proceed resuming the current thread. */ if (!follow_fork ()) { diff --git a/gdb/testsuite/gdb.base/vfork-follow-parent.c b/gdb/testsuite/gdb.base/vfork-follow-parent.c index df45b9c2dbe..15ff84a0bad 100644 --- a/gdb/testsuite/gdb.base/vfork-follow-parent.c +++ b/gdb/testsuite/gdb.base/vfork-follow-parent.c @@ -17,6 +17,10 @@ #include +#include +#include +#include + static volatile int unblock_parent = 0; static void @@ -25,7 +29,7 @@ break_parent (void) } int -main (void) +main (int argc, char **argv) { alarm (30); @@ -40,7 +44,28 @@ main (void) break_parent (); } else - _exit (0); + { +#if defined TEST_EXEC + char prog[PATH_MAX]; + int len; + + strcpy (prog, argv[0]); + len = strlen (prog); + for (; len > 0; --len) + { + if (prog[len - 1] == '/') + break; + } + strcpy (&prog[len], "vforked-prog"); + execlp (prog, prog, (char *) 0); + perror ("exec failed"); + _exit (1); +#elif defined TEST_EXIT + _exit (0); +#else +#error Define TEST_EXEC or TEST_EXIT +#endif + } return 0; } diff --git a/gdb/testsuite/gdb.base/vfork-follow-parent.exp b/gdb/testsuite/gdb.base/vfork-follow-parent.exp index 89c38001dac..70b54e729a5 100644 --- a/gdb/testsuite/gdb.base/vfork-follow-parent.exp +++ b/gdb/testsuite/gdb.base/vfork-follow-parent.exp @@ -19,10 +19,28 @@ # schedule-multiple on" or "set detach-on-fork on". Test these two resolution # methods. -standard_testfile +standard_testfile .c vforked-prog.c -if { [build_executable "failed to prepare" \ - ${testfile} ${srcfile}] } { +set binfile ${testfile}-exit +set binfile2 ${testfile}-exec +set binfile3 vforked-prog + +if { [build_executable "compile $binfile3" $binfile3 $srcfile2] } { + untested "failed to compile third test binary" + return -1 +} + +set remote_exec_prog [gdb_remote_download target $binfile3] + +set opts [list debug additional_flags=-DTEST_EXIT] +if { [build_executable "compile ${binfile}" ${binfile} ${srcfile} ${opts}] } { + untested "failed to compile first test binary" + return +} + +set opts [list debug additional_flags=-DTEST_EXEC] +if { [build_executable "compile ${binfile2}" ${binfile2} ${srcfile} ${opts}] } { + untested "failed to compile second test binary" return } @@ -31,8 +49,12 @@ if { [build_executable "failed to prepare" \ # or "schedule-multiple" (the two alternatives the message suggests to the # user). -proc do_test { resolution_method } { - clean_restart $::binfile +proc do_test { exec_file resolution_method target_non_stop non_stop } { + save_vars { ::GDBFLAGS } { + append ::GDBFLAGS " -ex \"maint set target-non-stop ${target_non_stop}\"" + append ::GDBFLAGS " -ex \"set non-stop ${non_stop}\"" + clean_restart $exec_file + } gdb_test_no_output "set detach-on-fork off" @@ -40,6 +62,10 @@ proc do_test { resolution_method } { return } + # Delete the breakpoint on main so we don't bit the breakpoint in + # the case that the vfork child performs an exec. + delete_breakpoints + gdb_test "break break_parent" gdb_test "continue" \ @@ -75,6 +101,16 @@ proc do_test { resolution_method } { "continue to break_parent" } -foreach_with_prefix resolution_method {detach-on-fork schedule-multiple} { - do_test $resolution_method +foreach_with_prefix exec_file [list $binfile $binfile2] { + foreach_with_prefix target-non-stop {on off} { + # This test was written assuming non-stop mode is off. + foreach_with_prefix non-stop {off} { + if {!${target-non-stop} && ${non-stop}} { + continue + } + foreach_with_prefix resolution_method {detach-on-fork schedule-multiple} { + do_test $exec_file $resolution_method ${target-non-stop} ${non-stop} + } + } + } }