From 4330c110550242571da017a1b15ae0b765723ae8 Mon Sep 17 00:00:00 2001 From: FranckQC <89943638+FranckQC@users.noreply.github.com> Date: Sat, 27 Jul 2024 23:32:22 -0500 Subject: [PATCH] [Hexagon] Fix LWP assembly handler (predicate register) (#17204) * Fix LWP assembly handler (predicate register) (#2216) This solves the issue with LWP that appears with maxpool. The problem was that the LWP handler was forgetting to save p0 (used by the handler). This predicate register needs to be saved too, just like r0-r5, as it had been decided that it was the responsibility of the handler to save everything (even these theoretically caller-saved registers). Said differently, since it had been decided that calling the LWP handler would not follow the normal ABI, and that the LWP handler would save everything it touches (even normally caller-saved registers like r0-r15 and p0-3), then it absolutely needs to save the predicate registers too (in particular p0, which was causing the issue). The issue appeared only with maxpool because it's the only one that had a state saved in p0 before calling the LWP handler. And this call destroyed the content of what it had saved, making it subsequently branch to different portions of the code. Fix: Allocate 32 bytes (instead of 24 previously), in order to save p3:0, and save them at the bottom of the stack. Restore them at the end of the LWP handler. * Remove trailing spaces --------- Co-authored-by: Slama, Franck --- src/runtime/hexagon/profiler/lwp_handler.S | 25 +++++++++++++++------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/src/runtime/hexagon/profiler/lwp_handler.S b/src/runtime/hexagon/profiler/lwp_handler.S index 611c0713111a..8cd02dd828f4 100644 --- a/src/runtime/hexagon/profiler/lwp_handler.S +++ b/src/runtime/hexagon/profiler/lwp_handler.S @@ -50,12 +50,17 @@ handler itself. 
.falign .type lwp_handler,@function lwp_handler: - { allocframe(#24) // Allocate 24 bytes on the stack to save R0-R5 registers + { + allocframe(#32) // Allocate 32 bytes on the stack to save R0-R5 registers (6*4bytes) and P0-P3 (4*1byte) + 4 unused bytes as the stack has to be 8-bytes aligned memd(r29+#-16) = r5:4 // Save R5,R4 + r5 = p3:0 // We will save P3:0 but we need an intermediate usual register (R5) that has already been saved + } + { + memd(r29+#16) = r3:2 // Save R3,R2 + memd(r29+#8) = r1:0 // Save R1, R0 } { - memd(r29+#8) = r3:2 // Save R3,R2 - memd(r29+#0) = r1:0 // Save R1, R0 + memw(r29+#0) = r5 // Save P3:0 (via R5) r2 = add(pc,##_GLOBAL_OFFSET_TABLE_@PCREL) // Get GOT address } { @@ -102,14 +107,18 @@ lwp_handler: memw(r5+#8) = r0 // Save lower 32 bits } .falign -.LBB0_3: +.LBB0_3: // Restore the registers from the stack + { + r1 = memw(r29+#0) // We will restore P3:0 but need an intermediate usual register (R1) that hasn't already been restored + r5:4 = memd(r29+#24) // Restore R5:4 + } { - r5:4 = memd(r29+#16) // Restore the registers from the stack - r3:2 = memd(r29+#8) + r3:2 = memd(r29+#16) // Restore R3:2 + p3:0 = r1 // Restore P3:0 (via R1, not yet restored) } { - r1:0 = memd(r29+#0) - dealloc_return // Deallocate the stack and return + r1:0 = memd(r29+#8) // Restore R1:0 + dealloc_return // Deallocate the stack and return } .Lfunc_end0: .size lwp_handler, .Lfunc_end0-lwp_handler