-
Notifications
You must be signed in to change notification settings - Fork 4.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Ineffective codegen for or pattern type checking #47920
Comments
Did you have a chance to look at 6.0 codegen? It may be better. If not, I'll take a look when I get a chance. |
I changed to |
I don't think sharplab's |
Historically, sharplab uses I've built latest master and viewed with disasmo. The problem still exists. By the way, |
Here are two other snippets that generate different asm, at least with SharpLab. I don't know if they are "better", but I observe that:
public bool M3(object o)
{
return o is not null and (int or uint or long);
}
public bool M4(object o)
{
return o switch
{
int or uint or long => true,
_ => false
};
} C.M3(System.Object)
L0000: push ebp
L0001: mov ebp, esp
L0003: test edx, edx
L0005: je short L002a
L0007: mov eax, edx
L0009: cmp dword ptr [eax], 0xca73b4
L000f: je short L0023
L0011: mov eax, edx
L0013: cmp dword ptr [eax], 0xca79b8
L0019: je short L0023
L001b: cmp dword ptr [edx], 0xca7fbc
L0021: jne short L002a
L0023: mov eax, 1
L0028: jmp short L002c
L002a: xor eax, eax
L002c: pop ebp
L002d: ret
C.M4(System.Object)
L0000: push ebp
L0001: mov ebp, esp
L0003: mov eax, edx
L0005: test eax, eax
L0007: je short L0011
L0009: cmp dword ptr [eax], 0xca73b4
L000f: jne short L0015
L0011: test edx, edx
L0013: jne short L0037
L0015: mov eax, edx
L0017: test eax, eax
L0019: je short L0023
L001b: cmp dword ptr [eax], 0xca79b8
L0021: jne short L0027
L0023: test edx, edx
L0025: jne short L0037
L0027: test edx, edx
L0029: je short L0033
L002b: cmp dword ptr [edx], 0xca7fbc
L0031: jne short L003e
L0033: test edx, edx
L0035: je short L003e
L0037: mov eax, 1
L003c: jmp short L0040
L003e: xor eax, eax
L0040: pop ebp
L0041: ret |
Well, even a simple `bool Test(object o) => o is int;` produces sub-optimal codegen: ; Method Test(System.Object):bool:this
G_M7842_IG01: ;; offset=0000H
;; bbWeight=1 PerfScore 0.00
G_M7842_IG02: ;; offset=0000H
4885D2 test rdx, rdx
7411 je SHORT G_M7842_IG05
;; bbWeight=1 PerfScore 1.25
G_M7842_IG03: ;; offset=0005H
48B8A09B3944FF7F0000 mov rax, 0x7FFF44399BA0
483902 cmp qword ptr [rdx], rax
7402 je SHORT G_M7842_IG05
;; bbWeight=0.25 PerfScore 0.81
G_M7842_IG04: ;; offset=0014H
33D2 xor rdx, rdx
;; bbWeight=0.13 PerfScore 0.03
G_M7842_IG05: ;; offset=0016H
4885D2 test rdx, rdx
0F95C0 setne al
0FB6C0 movzx rax, al
;; bbWeight=1 PerfScore 1.50
G_M7842_IG06: ;; offset=001FH
C3 ret
;; bbWeight=1 PerfScore 1.00
; Total bytes of code: 32 The problem here is that we emit a giant tree for the following IL:
After importer:
We emit a huge tree for
|
6.0 codegen for M1 and M2 looks pretty good (better than the sharplab above, which is indeed showing 5.0 codegen). Main difference from 5.0 is likely the CSE of the method table fetches enabled by #45854. M1 could improve its handling of return values a bit. ; Assembly listing for method C:M1(Object):bool:this
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; partially interruptible
; discarded IBC profile data due to mismatch in ILSize
; Final local variable assignments
;
;* V00 this [V00 ] ( 0, 0 ) ref -> zero-ref this class-hnd
; V01 arg1 [V01,T00] ( 5, 4 ) ref -> rdx class-hnd
;# V02 OutArgs [V02 ] ( 1, 1 ) lclBlk ( 0) [rsp+0x00] "OutgoingArgSpace"
; V03 tmp1 [V03,T01] ( 2, 2 ) ref -> rax class-hnd "spilling QMark2"
;* V04 tmp2 [V04,T04] ( 0, 0 ) ref -> zero-ref class-hnd "spilling QMark2"
; V05 tmp3 [V05,T02] ( 3, 2.25) ref -> rdx class-hnd "spilling QMark2"
; V06 cse0 [V06,T03] ( 4, 2 ) long -> rax "CSE - moderate"
;
; Lcl frame size = 0
G_M18147_IG01:
;; bbWeight=1 PerfScore 0.00
G_M18147_IG02:
test rdx, rdx
je SHORT G_M18147_IG09
;; bbWeight=1 PerfScore 1.25
G_M18147_IG03:
mov rax, rdx
mov rax, qword ptr [rax]
mov rcx, 0xD1FFAB1E
cmp rax, rcx
je SHORT G_M18147_IG07
mov rcx, 0xD1FFAB1E
cmp rax, rcx
je SHORT G_M18147_IG07
mov rcx, 0xD1FFAB1E
cmp rax, rcx
je SHORT G_M18147_IG05
;; bbWeight=0.50 PerfScore 3.38
G_M18147_IG04:
xor rdx, rdx
;; bbWeight=0.12 PerfScore 0.03
G_M18147_IG05:
test rdx, rdx
setne al
movzx rax, al
;; bbWeight=0.50 PerfScore 0.75
G_M18147_IG06:
ret
;; bbWeight=0.50 PerfScore 0.50
G_M18147_IG07:
mov eax, 1
;; bbWeight=0.50 PerfScore 0.12
G_M18147_IG08:
ret
;; bbWeight=0.50 PerfScore 0.50
G_M18147_IG09:
xor eax, eax
;; bbWeight=0.50 PerfScore 0.12
G_M18147_IG10:
ret
;; bbWeight=0.50 PerfScore 0.50 ; Assembly listing for method C:M2(Object):bool:this
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; partially interruptible
; discarded IBC profile data due to mismatch in ILSize
; Final local variable assignments
;
;* V00 this [V00 ] ( 0, 0 ) ref -> zero-ref this class-hnd
; V01 arg1 [V01,T00] ( 4, 3.50) ref -> rdx class-hnd
;# V02 OutArgs [V02 ] ( 1, 1 ) lclBlk ( 0) [rsp+0x00] "OutgoingArgSpace"
; V03 cse0 [V03,T01] ( 4, 2 ) long -> rax "CSE - aggressive"
;
; Lcl frame size = 0
G_M2592_IG01:
;; bbWeight=1 PerfScore 0.00
G_M2592_IG02:
test rdx, rdx
je SHORT G_M2592_IG07
;; bbWeight=1 PerfScore 1.25
G_M2592_IG03:
mov rax, qword ptr [rdx]
mov rdx, 0xD1FFAB1E
cmp rax, rdx
je SHORT G_M2592_IG05
mov rdx, 0xD1FFAB1E
cmp rax, rdx
je SHORT G_M2592_IG05
mov rdx, 0xD1FFAB1E
cmp rax, rdx
sete al
movzx rax, al
;; bbWeight=0.50 PerfScore 3.38
G_M2592_IG04:
ret
;; bbWeight=0.50 PerfScore 0.50
G_M2592_IG05:
mov eax, 1
;; bbWeight=0.50 PerfScore 0.12
G_M2592_IG06:
ret
;; bbWeight=0.50 PerfScore 0.50
G_M2592_IG07:
xor eax, eax
;; bbWeight=0.50 PerfScore 0.12
G_M2592_IG08:
ret
;; bbWeight=0.50 PerfScore 0.50 |
However ; Assembly listing for method C:M0(Object):bool:this
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; partially interruptible
; discarded IBC profile data due to mismatch in ILSize
; Final local variable assignments
;
;* V00 this [V00 ] ( 0, 0 ) ref -> zero-ref this class-hnd
; V01 arg1 [V01,T00] ( 7, 5 ) ref -> rdx class-hnd
;# V02 OutArgs [V02 ] ( 1, 1 ) lclBlk ( 0) [rsp+0x00] "OutgoingArgSpace"
; V03 tmp1 [V03,T01] ( 3, 4.50) ref -> rax class-hnd "spilling QMark2"
; V04 tmp2 [V04,T03] ( 3, 2.50) ref -> rax class-hnd "spilling QMark2"
; V05 tmp3 [V05,T02] ( 5, 3.75) ref -> rdx class-hnd "spilling QMark2"
;
; Lcl frame size = 0
G_M57506_IG01:
;; bbWeight=1 PerfScore 0.00
G_M57506_IG02:
mov rax, rdx
test rax, rax
je SHORT G_M57506_IG04
;; bbWeight=1 PerfScore 1.50
G_M57506_IG03:
mov rcx, 0xD1FFAB1E
cmp qword ptr [rax], rcx
jne SHORT G_M57506_IG05
;; bbWeight=0.25 PerfScore 0.81
G_M57506_IG04:
test rdx, rdx
jne SHORT G_M57506_IG13
;; bbWeight=0.50 PerfScore 0.62
G_M57506_IG05:
mov rax, rdx
test rax, rax
je SHORT G_M57506_IG07
;; bbWeight=0.50 PerfScore 0.75
G_M57506_IG06:
mov rcx, 0xD1FFAB1E
cmp qword ptr [rax], rcx
jne SHORT G_M57506_IG08
;; bbWeight=0.25 PerfScore 0.81
G_M57506_IG07:
test rdx, rdx
jne SHORT G_M57506_IG13
;; bbWeight=0.50 PerfScore 0.62
G_M57506_IG08:
test rdx, rdx
je SHORT G_M57506_IG11
;; bbWeight=0.50 PerfScore 0.62
G_M57506_IG09:
mov rax, 0xD1FFAB1E
cmp qword ptr [rdx], rax
je SHORT G_M57506_IG11
;; bbWeight=0.25 PerfScore 0.81
G_M57506_IG10:
xor rdx, rdx
;; bbWeight=0.12 PerfScore 0.03
G_M57506_IG11:
test rdx, rdx
setne al
movzx rax, al
;; bbWeight=0.50 PerfScore 0.75
G_M57506_IG12:
ret
;; bbWeight=0.50 PerfScore 0.50
G_M57506_IG13:
mov eax, 1
;; bbWeight=0.50 PerfScore 0.12
G_M57506_IG14:
ret
;; bbWeight=0.50 PerfScore 0.50 The jump-threading optimizations introduced in #46257 bail out as there is a temp assign (and hence side effect) in the block threading would like to avoid:
This temp assign is a consequence of the expansion of qmarks done by morph in […]. At any rate we can / should extend RBO to allow duplication of the side-effecting code. I didn't do this initially because:
I don't see us getting to this in 6.0 given other priorities, though this sort of chained type test pattern is likely somewhat common (and more of it is on the way from PGO induced GDV) so we may find time to squeeze it in. |
For completeness, here's the 6.0 codegen for
; Assembly listing for method C:M3(Object):bool:this
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; partially interruptible
; discarded IBC profile data due to mismatch in ILSize
; Final local variable assignments
;
;* V00 this [V00 ] ( 0, 0 ) ref -> zero-ref this class-hnd
; V01 arg1 [V01,T00] ( 4, 3.50) ref -> rdx class-hnd
; V02 loc0 [V02,T03] ( 3, 2 ) bool -> rax
;# V03 OutArgs [V03 ] ( 1, 1 ) lclBlk ( 0) [rsp+0x00] "OutgoingArgSpace"
; V04 tmp1 [V04,T01] ( 2, 2 ) ref -> rdx class-hnd "spilling QMark2"
;* V05 tmp2 [V05,T04] ( 0, 0 ) ref -> zero-ref class-hnd "spilling QMark2"
;* V06 tmp3 [V06,T05] ( 0, 0 ) ref -> zero-ref class-hnd "spilling QMark2"
; V07 cse0 [V07,T02] ( 4, 2 ) long -> rax "CSE - moderate"
;
; Lcl frame size = 0
G_M3425_IG01:
;; bbWeight=1 PerfScore 0.00
G_M3425_IG02:
test rdx, rdx
je SHORT G_M3425_IG05
;; bbWeight=1 PerfScore 1.25
G_M3425_IG03:
mov rax, qword ptr [rdx]
mov rdx, 0xD1FFAB1E
cmp rax, rdx
je SHORT G_M3425_IG04
mov rdx, 0xD1FFAB1E
cmp rax, rdx
je SHORT G_M3425_IG04
mov rdx, 0xD1FFAB1E
cmp rax, rdx
jne SHORT G_M3425_IG05
;; bbWeight=0.50 PerfScore 3.25
G_M3425_IG04:
mov eax, 1
jmp SHORT G_M3425_IG06
;; bbWeight=0.50 PerfScore 1.12
G_M3425_IG05:
xor eax, eax
;; bbWeight=0.50 PerfScore 0.12
G_M3425_IG06:
ret
;; bbWeight=1 PerfScore 1.00 ; Assembly listing for method C:M4(Object):bool:this
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; partially interruptible
; discarded IBC profile data due to mismatch in ILSize
; Final local variable assignments
;
;* V00 this [V00 ] ( 0, 0 ) ref -> zero-ref this class-hnd
; V01 arg1 [V01,T00] ( 7, 5 ) ref -> rdx class-hnd
; V02 loc0 [V02,T04] ( 3, 2 ) bool -> rax
;# V03 OutArgs [V03 ] ( 1, 1 ) lclBlk ( 0) [rsp+0x00] "OutgoingArgSpace"
; V04 tmp1 [V04,T01] ( 3, 4.50) ref -> rax class-hnd "spilling QMark2"
; V05 tmp2 [V05,T02] ( 3, 2.50) ref -> rax class-hnd "spilling QMark2"
; V06 tmp3 [V06,T03] ( 3, 2.50) ref -> rdx class-hnd "spilling QMark2"
;
; Lcl frame size = 0
G_M53926_IG01:
;; bbWeight=1 PerfScore 0.00
G_M53926_IG02:
mov rax, rdx
test rax, rax
je SHORT G_M53926_IG04
;; bbWeight=1 PerfScore 1.50
G_M53926_IG03:
mov rcx, 0xD1FFAB1E
cmp qword ptr [rax], rcx
jne SHORT G_M53926_IG05
;; bbWeight=0.25 PerfScore 0.81
G_M53926_IG04:
test rdx, rdx
jne SHORT G_M53926_IG10
;; bbWeight=0.50 PerfScore 0.62
G_M53926_IG05:
mov rax, rdx
test rax, rax
je SHORT G_M53926_IG07
;; bbWeight=0.50 PerfScore 0.75
G_M53926_IG06:
mov rcx, 0xD1FFAB1E
cmp qword ptr [rax], rcx
jne SHORT G_M53926_IG08
;; bbWeight=0.25 PerfScore 0.81
G_M53926_IG07:
test rdx, rdx
jne SHORT G_M53926_IG10
;; bbWeight=0.50 PerfScore 0.62
G_M53926_IG08:
test rdx, rdx
je SHORT G_M53926_IG11
;; bbWeight=0.50 PerfScore 0.62
G_M53926_IG09:
mov rax, 0xD1FFAB1E
cmp qword ptr [rdx], rax
jne SHORT G_M53926_IG11
;; bbWeight=0.25 PerfScore 0.81
G_M53926_IG10:
mov eax, 1
jmp SHORT G_M53926_IG12
;; bbWeight=0.50 PerfScore 1.12
G_M53926_IG11:
xor eax, eax
;; bbWeight=0.50 PerfScore 0.12
G_M53926_IG12:
ret
;; bbWeight=1 PerfScore 1.00 |
@AndyAyersMS Do you think it makes sense to special case
in the importer, something like this EgorBo@6659f53 or it just should be handled via #48115? |
Handling it early is a good idea. The less IR/flow we introduce early, the better. |
Is the pattern
also handled? It looks to be common too. |
Need to test these out with #76476. |
I've investigated myself, and successfully optimized isinst+ldnull+ceq/cgt.un and isinst+brtrue/brfalse patterns locally. However, I have no idea how to handle the isinst+dup+x pattern, which is used by The hard part here is fgExpandQmarkForCastInstOf was written in mind with I also tried to introduce a new managed helper for doing the @EgorBo do you have any suggestions around these? |
Not really, ?? and other cases often produce multi-use boxing that we can't pattern match here
it can be implemented, just a bit of work + new JIT-EE API |
I'd like to try it. What information needs to be passed through the interface? Should the body of the helper method be eagerly loaded and passed? |
All of M0-M4 now produce very similar code with .NET 9 Preview 5-ish bits (M2 has a slightly different layout) ; Method C:M4(System.Object):ubyte:this (FullOpts)
G_M53340_IG01: ;; offset=0x0000
;; size=0 bbWeight=1 PerfScore 0.00
G_M53340_IG02: ;; offset=0x0000
test rdx, rdx
je SHORT G_M53340_IG05
;; size=5 bbWeight=1 PerfScore 1.25
G_M53340_IG03: ;; offset=0x0005
mov rax, qword ptr [rdx]
mov rcx, 0x7FFE205A5A10 ; System.Int32
cmp rax, rcx
je SHORT G_M53340_IG04
mov rcx, 0x7FFE2062CA58 ; System.UInt32
cmp rax, rcx
je SHORT G_M53340_IG04
mov rcx, 0x7FFE206613C0 ; System.Int64
cmp rax, rcx
jne SHORT G_M53340_IG05
;; size=48 bbWeight=0.25 PerfScore 1.62
G_M53340_IG04: ;; offset=0x0035
mov eax, 1
jmp SHORT G_M53340_IG06
;; size=7 bbWeight=0.50 PerfScore 1.12
G_M53340_IG05: ;; offset=0x003C
xor eax, eax
;; size=2 bbWeight=0.50 PerfScore 0.12
G_M53340_IG06: ;; offset=0x003E
ret
;; size=1 bbWeight=1 PerfScore 1.00
; Total bytes of code: 63 Egor's ; Method C:Test(System.Object):ubyte:this (FullOpts)
G_M56083_IG01: ;; offset=0x0000
;; size=0 bbWeight=1 PerfScore 0.00
G_M56083_IG02: ;; offset=0x0000
test rdx, rdx
je SHORT G_M56083_IG05
;; size=5 bbWeight=1 PerfScore 1.25
G_M56083_IG03: ;; offset=0x0005
mov rax, 0x7FFE20595A10 ; System.Int32
cmp qword ptr [rdx], rax
jne SHORT G_M56083_IG05
;; size=15 bbWeight=0.25 PerfScore 1.06
G_M56083_IG04: ;; offset=0x0014
jmp SHORT G_M56083_IG06
;; size=2 bbWeight=0.12 PerfScore 0.25
G_M56083_IG05: ;; offset=0x0016
xor rdx, rdx
;; size=2 bbWeight=0.25 PerfScore 0.06
G_M56083_IG06: ;; offset=0x0018
test rdx, rdx
setne al
movzx rax, al
;; size=9 bbWeight=1 PerfScore 1.50
G_M56083_IG07: ;; offset=0x0021
ret
;; size=1 bbWeight=1 PerfScore 1.00
; Total bytes of code: 34 But it looks like #103391 will address that. |
Description
Examine this code:
IL Compiled:
codegen:
sharplab
Looking through it:
The pattern matching codegen moves from rdx to rax for every test.
If there's no null check, the JIT will generate a null test for each condition.
The compiled IL of pattern matching uses `ldnull` and `cgt` for the last test. This pattern may confuse the JIT.
Configuration
sharplab default x64 (5.0 timeframe)
Regression?
Probably not.
category:cq
theme:redundant-branches
skill-level:expert
cost:medium
impact:small
The text was updated successfully, but these errors were encountered: