Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

JIT: Support delegate GDV guards in loop cloning #75140

Merged
merged 12 commits into from
Oct 20, 2022
8 changes: 5 additions & 3 deletions src/coreclr/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -7413,17 +7413,17 @@ class Compiler
Statement* stmt;
const unsigned loopNum;
const bool cloneForArrayBounds;
const bool cloneForTypeTests;
const bool cloneForGDVTests;
LoopCloneVisitorInfo(LoopCloneContext* context,
unsigned loopNum,
Statement* stmt,
bool cloneForArrayBounds,
bool cloneForTypeTests)
bool cloneForGDVTests)
: context(context)
, stmt(nullptr)
, loopNum(loopNum)
, cloneForArrayBounds(cloneForArrayBounds)
, cloneForTypeTests(cloneForTypeTests)
, cloneForGDVTests(cloneForGDVTests)
{
}
};
Expand All @@ -7437,6 +7437,8 @@ class Compiler
fgWalkResult optCanOptimizeByLoopCloning(GenTree* tree, LoopCloneVisitorInfo* info);
bool optObtainLoopCloningOpts(LoopCloneContext* context);
bool optIsLoopClonable(unsigned loopInd);
bool optCheckLoopCloningGDVTestProfitable(GenTreeOp* guard, LoopCloneVisitorInfo* info);
bool optIsHandleOrIndirOfHandle(GenTree* tree, GenTreeFlags handleType);

bool optLoopCloningEnabled();

Expand Down
3 changes: 2 additions & 1 deletion src/coreclr/jit/indirectcalltransformer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -594,7 +594,7 @@ class IndirectCallTransformer
// TODO-GDV: Consider duplicating the store at the end of the
// cold case for the previous GDV. Then we can reuse the target
// if the second check of a chained GDV fails.
bool reuseTarget = (origCall->gtCallMoreFlags & GTF_CALL_M_GUARDED_DEVIRT_CHAIN) == 0;
bool reuseTarget = false; //(origCall->gtCallMoreFlags & GTF_CALL_M_GUARDED_DEVIRT_CHAIN) == 0;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be disabled just for method GDV?

Copy link
Member Author

@jakobbotsch jakobbotsch Sep 8, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you mean for vtable GDV only, and not for delegate GDV? This code does not run for class-based GDV.

As you can probably guess it's necessary because otherwise we cannot recognize the test in loop cloning. However, it does introduce an extra load on the cold path. Actually this seems fine on xarch as we can contain the load in the hot compare, so in a simple case of:

private static Func<long, long> s_f;

[MethodImpl(MethodImplOptions.NoInlining)]
private static long Foo()
{
    long result = 0;
    for (long i = 0; i < 100000; i++)
        result += s_f(i);

    return result;
}

The diff looks like:

@@ -33,19 +32,18 @@ G_M54051_IG02:
        mov      rbx, 0xD1FFAB1E      ; data for Program:s_f
                                                 ;; size=14 bbWeight=1    PerfScore 0.75
 G_M54051_IG03:
-       mov      rcx, gword ptr [rbx]
-       mov      r8, qword ptr [rcx+18H]
+       mov      r8, gword ptr [rbx]
        mov      rax, 0xD1FFAB1E      ; code for <>c:<Main>b__0_0
-       cmp      r8, rax
+       cmp      qword ptr [r8+18H], rax
        jne      SHORT G_M54051_IG07
        imul     rbp, rdi, 42

 G_M54051_IG04:
        add      rsi, rbp
        inc      rdi
        cmp      rdi, 0x186A0
        jl       SHORT G_M54051_IG03

 G_M54051_IG05:
        mov      rax, rsi
                                                 ;; size=3 bbWeight=0.67 PerfScore 0.17
@@ -58,12 +56,12 @@ G_M54051_IG06:
        ret
                                                 ;; size=9 bbWeight=0.67 PerfScore 2.17
 G_M54051_IG07:
-       mov      rcx, gword ptr [rcx+08H]
+       mov      rcx, gword ptr [r8+08H]
        mov      rdx, rdi
-       call     r8
+       call     [r8+18H]System.Func`2[long,long]:Invoke(long):long:this
        mov      rbp, rax
        jmp      SHORT G_M54051_IG04
-                                                ;; size=15 bbWeight=0    PerfScore 0.00
+                                                ;; size=16 bbWeight=0    PerfScore 0.00

On arm it's worse (depending on register pressure) since we have to load the delegate address into a register anyway, so we just stop reusing this register on the cold path:

@@ -43,19 +42,19 @@ G_M54051_IG02:
             movk    x23, #0xD1FFAB1E LSL #32
                                                 ;; size=40 bbWeight=1    PerfScore 5.00
 G_M54051_IG03:
-            ldr     x1, [x22]
-            ldr     x2, [x1, #0x18]
-            cmp     x2, x23
+            ldr     x2, [x22]
+            ldr     x0, [x2, #0x18]
+            cmp     x0, x23
             bne     G_M54051_IG07
             mov     x0, #42
             mul     x24, x20, x0

 G_M54051_IG04:
             add     x19, x19, x24
             add     x20, x20, #1
             cmp     x20, x21
             blt     G_M54051_IG03

 G_M54051_IG05:
             mov     x0, x19
                                                 ;; size=4 bbWeight=0.67 PerfScore 0.33
@@ -67,12 +66,13 @@ G_M54051_IG06:
             ret     lr
                                                 ;; size=20 bbWeight=0.67 PerfScore 3.33
 G_M54051_IG07:
-            ldr     x0, [x1, #0x08]
+            ldr     x0, [x2, #0x08]
             mov     x1, x20
+            ldr     x2, [x2, #0x18]
             blr     x2
             mov     x24, x0
             b       G_M54051_IG04
-                                                ;; size=20 bbWeight=0    PerfScore 0.00
+                                                ;; size=24 bbWeight=0    PerfScore 0.00

My opinion here is that this should ultimately be left up to CSE to decide. We may want to look at whether there is some CSE heuristics we should change for ARM given that less containment is possible there, so creating a CSE def in a hot block for a CSE use in a cold block may be more attractive (edit: opened #75253 for this).

This also does put delegate GDV in line with virtual class GDV. We do not reuse the method table indir in the cold case for virtual class GDVs either, even though we could.

if (origCall->IsVirtualVtable())
{
GenTree* tarTree = compiler->fgExpandVirtualVtableCallTarget(origCall);
Expand Down Expand Up @@ -638,6 +638,7 @@ class IndirectCallTransformer
TYP_I_IMPL);
GenTree* tarTree = compiler->gtNewOperNode(GT_ADD, TYP_BYREF, thisTree, offset);
tarTree = compiler->gtNewIndir(TYP_I_IMPL, tarTree);
tarTree->gtFlags |= GTF_IND_INVARIANT;

if (reuseTarget)
{
Expand Down
4 changes: 2 additions & 2 deletions src/coreclr/jit/jitconfigvalues.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ CONFIG_INTEGER(JitBreakOnBadCode, W("JitBreakOnBadCode"), 0)
CONFIG_INTEGER(JitBreakOnMinOpts, W("JITBreakOnMinOpts"), 0) // Halt if jit switches to MinOpts
CONFIG_INTEGER(JitBreakOnUnsafeCode, W("JitBreakOnUnsafeCode"), 0)
CONFIG_INTEGER(JitCloneLoops, W("JitCloneLoops"), 1) // If 0, don't clone. Otherwise clone loops for optimizations.
CONFIG_INTEGER(JitCloneLoopsWithTypeTests, W("JitCloneLoopsWithTypeTests"), 1) // If 0, don't clone loops based on
// invariant type tests
CONFIG_INTEGER(JitCloneLoopsWithGdvTests, W("JitCloneLoopsWithGdvTests"), 1) // If 0, don't clone loops based on
// invariant type/method address tests
CONFIG_INTEGER(JitDebugLogLoopCloning, W("JitDebugLogLoopCloning"), 0) // In debug builds log places where loop cloning
// optimizations are performed on the fast path.
CONFIG_INTEGER(JitDefaultFill, W("JitDefaultFill"), 0xdd) // In debug builds, initialize the memory allocated by the nra
Expand Down
Loading