forked from MihaZupan/runtime-utils
-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[JitDiff X64] [EgorBo] Remove redundant sign/zero extension for SIMD broadcasts #767
Comments
Top method regressions
4 (1.40 % of base) - System.PackedSpanHelpers:Contains(byref,short,int):ubyte
; Assembly listing for method System.PackedSpanHelpers:Contains(byref,short,int):ubyte (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 3 single block inlinees; 0 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T02] ( 12, 10.50) byref -> rdi single-def
; V01 arg1 [V01,T05] ( 6, 3.75) short -> rsi single-def
; V02 arg2 [V02,T01] ( 15, 19.50) int -> rdx
; V03 loc0 [V03,T09] ( 3, 1.25) long -> rax
; V04 loc1 [V04,T00] ( 9, 22 ) byref -> rax
;* V05 loc2 [V05 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V06 loc3 [V06 ] ( 0, 0 ) byref -> zero-ref
;* V07 loc4 [V07 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[short]>
;* V08 loc5 [V08 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V09 loc6 [V09 ] ( 0, 0 ) byref -> zero-ref
;* V10 loc7 [V10 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[short]>
;* V11 loc8 [V11 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
; V12 loc9 [V12,T13] ( 3, 5 ) simd32 -> mm0 <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V13 loc10 [V13,T06] ( 2, 4.50) byref -> rcx single-def
;* V14 loc11 [V14 ] ( 0, 0 ) simd32 -> zero-ref <System.Runtime.Intrinsics.Vector256`1[short]>
;* V15 loc12 [V15 ] ( 0, 0 ) simd32 -> zero-ref <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V16 loc13 [V16,T07] ( 4, 2 ) byref -> rdi single-def
;* V17 loc14 [V17 ] ( 0, 0 ) simd32 -> zero-ref <System.Runtime.Intrinsics.Vector256`1[short]>
;* V18 loc15 [V18 ] ( 0, 0 ) simd32 -> zero-ref <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V19 loc16 [V19,T16] ( 2, 1 ) simd16 -> mm0 <System.Runtime.Intrinsics.Vector128`1[ubyte]>
;* V20 loc17 [V20 ] ( 0, 0 ) byref -> zero-ref
;* V21 loc18 [V21 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[short]>
;* V22 loc19 [V22 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ubyte]>
; V23 loc20 [V23,T08] ( 4, 2 ) byref -> rax single-def
;* V24 loc21 [V24 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[short]>
;* V25 loc22 [V25 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ubyte]>
;# V26 OutArgs [V26 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
; V27 tmp1 [V27,T10] ( 2, 1 ) byref -> rdi
;* V28 tmp2 [V28 ] ( 0, 0 ) simd16 -> zero-ref "impAppendStmt"
; V29 tmp3 [V29,T11] ( 2, 1 ) byref -> rax
;* V30 tmp4 [V30 ] ( 0, 0 ) simd32 -> zero-ref "impAppendStmt"
;* V31 tmp5 [V31 ] ( 0, 0 ) simd32 -> zero-ref "impAppendStmt"
;* V32 tmp6 [V32 ] ( 0, 0 ) simd32 -> zero-ref "field V05._lower (fldOffset=0x0)" P-INDEP
;* V33 tmp7 [V33 ] ( 0, 0 ) simd32 -> zero-ref "field V05._upper (fldOffset=0x20)" P-INDEP
;* V34 tmp8 [V34 ] ( 0, 0 ) simd32 -> zero-ref "field V07._lower (fldOffset=0x0)" P-INDEP
;* V35 tmp9 [V35 ] ( 0, 0 ) simd32 -> zero-ref "field V07._upper (fldOffset=0x20)" P-INDEP
;* V36 tmp10 [V36 ] ( 0, 0 ) simd32 -> zero-ref "field V08._lower (fldOffset=0x0)" P-INDEP
;* V37 tmp11 [V37 ] ( 0, 0 ) simd32 -> zero-ref "field V08._upper (fldOffset=0x20)" P-INDEP
;* V38 tmp12 [V38 ] ( 0, 0 ) simd32 -> zero-ref "field V10._lower (fldOffset=0x0)" P-INDEP
;* V39 tmp13 [V39 ] ( 0, 0 ) simd32 -> zero-ref "field V10._upper (fldOffset=0x20)" P-INDEP
;* V40 tmp14 [V40 ] ( 0, 0 ) simd32 -> zero-ref "field V11._lower (fldOffset=0x0)" P-INDEP
;* V41 tmp15 [V41 ] ( 0, 0 ) simd32 -> zero-ref "field V11._upper (fldOffset=0x20)" P-INDEP
; V42 cse0 [V42,T04] ( 7, 6.75) int -> rcx hoist multi-def "CSE #01: aggressive"
; V43 rat0 [V43,T03] ( 4, 12.25) long -> rax "Strength reduced derived IV"
; V44 rat1 [V44,T12] ( 3, 24 ) simd32 -> mm1 "ReplaceWithLclVar is creating a new local variable"
; V45 rat2 [V45,T14] ( 3, 3 ) simd32 -> mm0 "ReplaceWithLclVar is creating a new local variable"
; V46 rat3 [V46,T15] ( 3, 3 ) simd16 -> mm0 "ReplaceWithLclVar is creating a new local variable"
;
; Lcl frame size = 0
G_M4358_IG01:
push rbp
mov rbp, rsp
;; size=4 bbWeight=1 PerfScore 1.25
G_M4358_IG02:
cmp edx, 8
jl G_M4358_IG09
;; size=9 bbWeight=1 PerfScore 1.25
G_M4358_IG03:
mov rax, rdi
cmp edx, 16
jg SHORT G_M4358_IG06
- movzx rsi, sil
vmovd xmm0, esi
vpbroadcastb xmm0, xmm0
add edx, -8
- movsxd rax, edx
- lea rax, bword ptr [rdi+2*rax]
+ movsxd rsi, edx
+ lea rax, bword ptr [rdi+2*rsi]
cmp rdi, rax
cmova rdi, rax
vmovups xmm1, xmmword ptr [rdi]
vpackuswb xmm1, xmm1, xmmword ptr [rax]
vpcmpeqb xmm0, xmm1, xmm0
vptest xmm0, xmm0
je G_M4358_IG13
- ;; size=61 bbWeight=0.50 PerfScore 8.88
+ ;; size=57 bbWeight=0.50 PerfScore 8.75
G_M4358_IG04:
mov eax, 1
;; size=5 bbWeight=0.50 PerfScore 0.12
G_M4358_IG05:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
G_M4358_IG06:
- movzx rsi, sil
vmovd xmm0, esi
vpbroadcastb ymm0, ymm0
cmp edx, 32
jle SHORT G_M4358_IG08
- lea ecx, [rdx-0x20]
- movsxd rcx, ecx
+ lea esi, [rdx-0x20]
+ movsxd rcx, esi
lea rcx, bword ptr [rax+2*rcx]
- align [0 bytes for IG07]
- ;; size=28 bbWeight=0.50 PerfScore 3.38
+ align [8 bytes for IG07]
+ ;; size=32 bbWeight=0.50 PerfScore 3.38
G_M4358_IG07:
vmovups ymm1, ymmword ptr [rax]
vpackuswb ymm1, ymm1, ymmword ptr [rax+0x20]
vpcmpeqb ymm1, ymm1, ymm0
vptest ymm1, ymm1
jne SHORT G_M4358_IG04
add rax, 64
cmp rax, rcx
jb SHORT G_M4358_IG07
;; size=29 bbWeight=4 PerfScore 64.00
G_M4358_IG08:
add edx, -16
movsxd rcx, edx
lea rdi, bword ptr [rdi+2*rcx]
cmp rax, rdi
cmova rax, rdi
vmovups ymm1, ymmword ptr [rax]
vpackuswb ymm1, ymm1, ymmword ptr [rdi]
vpcmpeqb ymm0, ymm1, ymm0
vptest ymm0, ymm0
je SHORT G_M4358_IG13
jmp SHORT G_M4358_IG04
;; size=38 bbWeight=0.50 PerfScore 9.00
G_M4358_IG09:
xor eax, eax
cmp edx, 4
jl SHORT G_M4358_IG10
add edx, -4
movsx rax, word ptr [rdi]
movsx rcx, si
cmp eax, ecx
- je SHORT G_M4358_IG04
+ je G_M4358_IG04
movsx rax, word ptr [rdi+0x02]
cmp eax, ecx
je G_M4358_IG04
movsx rax, word ptr [rdi+0x04]
cmp eax, ecx
je G_M4358_IG04
movsx rax, word ptr [rdi+0x06]
cmp eax, ecx
je G_M4358_IG04
mov eax, 4
- ;; size=66 bbWeight=0.50 PerfScore 11.62
+ ;; size=70 bbWeight=0.50 PerfScore 11.62
G_M4358_IG10:
test edx, edx
jle SHORT G_M4358_IG13
;; size=4 bbWeight=0.50 PerfScore 0.62
G_M4358_IG11:
movsx rcx, si
add rax, rax
align [0 bytes for IG12]
;; size=7 bbWeight=0.25 PerfScore 0.12
G_M4358_IG12:
dec edx
movsx rsi, word ptr [rdi+rax]
cmp esi, ecx
je G_M4358_IG04
add rax, 2
test edx, edx
jg SHORT G_M4358_IG12
;; size=23 bbWeight=4 PerfScore 28.00
G_M4358_IG13:
xor eax, eax
;; size=2 bbWeight=0.50 PerfScore 0.12
G_M4358_IG14:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
-; Total bytes of code 286, prolog size 4, PerfScore 130.88, instruction count 86, allocated bytes for code 290 (MethodHash=f310eef9) for method System.PackedSpanHelpers:Contains(byref,short,int):ubyte (FullOpts)
+; Total bytes of code 290, prolog size 4, PerfScore 130.75, instruction count 84, allocated bytes for code 290 (MethodHash=f310eef9) for method System.PackedSpanHelpers:Contains(byref,short,int):ubyte (FullOpts)
; ============================================================
Top method improvements
-30 (-7.18 % of base) - System.PackedSpanHelpers:IndexOf[System.SpanHelpers+DontNegate`1[short],System.PackedSpanHelpers+NopTransform](byref,short,int):int
; Assembly listing for method System.PackedSpanHelpers:IndexOf[System.SpanHelpers+DontNegate`1[short],System.PackedSpanHelpers+NopTransform](byref,short,int):int (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 19 single block inlinees; 5 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T02] ( 15, 12 ) byref -> rdi single-def
; V01 arg1 [V01,T06] ( 6, 3.75) short -> rsi single-def
; V02 arg2 [V02,T01] ( 15, 19.50) int -> rdx
; V03 loc0 [V03,T03] ( 6, 13.50) long -> rcx
; V04 loc1 [V04,T00] ( 10, 22.50) byref -> rax
;* V05 loc2 [V05 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V06 loc3 [V06 ] ( 0, 0 ) byref -> zero-ref
;* V07 loc4 [V07 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[short]>
;* V08 loc5 [V08 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V09 loc6 [V09 ] ( 0, 0 ) byref -> zero-ref
;* V10 loc7 [V10 ] ( 0, 0 ) byref -> zero-ref
;* V11 loc8 [V11 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[short]>
;* V12 loc9 [V12 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
; V13 loc10 [V13,T17] ( 3, 5 ) simd32 -> mm0 <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V14 loc11 [V14,T07] ( 2, 4.50) byref -> rcx single-def
;* V15 loc12 [V15 ] ( 0, 0 ) simd32 -> zero-ref <System.Runtime.Intrinsics.Vector256`1[short]>
;* V16 loc13 [V16 ] ( 0, 0 ) simd32 -> zero-ref <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V17 loc14 [V17,T16] ( 4, 12.50) simd32 -> mm1 <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V18 loc15 [V18,T10] ( 5, 2.50) byref -> rcx single-def
;* V19 loc16 [V19 ] ( 0, 0 ) byref -> zero-ref single-def
;* V20 loc17 [V20 ] ( 0, 0 ) simd32 -> zero-ref <System.Runtime.Intrinsics.Vector256`1[short]>
;* V21 loc18 [V21 ] ( 0, 0 ) simd32 -> zero-ref <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V22 loc19 [V22,T18] ( 4, 2 ) simd32 -> mm0 <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V23 loc20 [V23,T20] ( 2, 1 ) simd16 -> mm0 <System.Runtime.Intrinsics.Vector128`1[ubyte]>
;* V24 loc21 [V24 ] ( 0, 0 ) byref -> zero-ref
;* V25 loc22 [V25 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[short]>
;* V26 loc23 [V26 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ubyte]>
;* V27 loc24 [V27 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ubyte]>
; V28 loc25 [V28,T11] ( 5, 2.50) byref -> rax single-def
;* V29 loc26 [V29 ] ( 0, 0 ) byref -> zero-ref single-def
;* V30 loc27 [V30 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[short]>
;* V31 loc28 [V31 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ubyte]>
; V32 loc29 [V32,T19] ( 4, 2 ) simd16 -> mm0 <System.Runtime.Intrinsics.Vector128`1[ubyte]>
;# V33 OutArgs [V33 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
; V34 tmp1 [V34,T14] ( 3, 1.50) byref -> rcx
;* V35 tmp2 [V35 ] ( 0, 0 ) simd16 -> zero-ref "impAppendStmt"
;* V36 tmp3 [V36 ] ( 0, 0 ) simd16 -> zero-ref "spilled call-like call argument"
; V37 tmp4 [V37,T15] ( 3, 1.50) byref -> rax
;* V38 tmp5 [V38 ] ( 0, 0 ) simd32 -> zero-ref "impAppendStmt"
;* V39 tmp6 [V39 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
;* V40 tmp7 [V40 ] ( 0, 0 ) simd32 -> zero-ref "impAppendStmt"
;* V41 tmp8 [V41 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
;* V42 tmp9 [V42 ] ( 0, 0 ) short -> zero-ref "Inlining Arg"
;* V43 tmp10 [V43 ] ( 0, 0 ) ubyte -> zero-ref "Inlining Arg"
;* V44 tmp11 [V44 ] ( 0, 0 ) short -> zero-ref "Inlining Arg"
;* V45 tmp12 [V45 ] ( 0, 0 ) ubyte -> zero-ref "Inlining Arg"
;* V46 tmp13 [V46 ] ( 0, 0 ) short -> zero-ref "Inlining Arg"
;* V47 tmp14 [V47 ] ( 0, 0 ) ubyte -> zero-ref "Inlining Arg"
;* V48 tmp15 [V48 ] ( 0, 0 ) short -> zero-ref "Inlining Arg"
;* V49 tmp16 [V49 ] ( 0, 0 ) ubyte -> zero-ref "Inlining Arg"
;* V50 tmp17 [V50 ] ( 0, 0 ) short -> zero-ref "Inlining Arg"
;* V51 tmp18 [V51 ] ( 0, 0 ) ubyte -> zero-ref "Inlining Arg"
;* V52 tmp19 [V52 ] ( 0, 0 ) simd32 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
;* V53 tmp20 [V53 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
;* V54 tmp21 [V54 ] ( 0, 0 ) simd32 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
;* V55 tmp22 [V55 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
; V56 tmp23 [V56,T12] ( 5, 2.50) int -> rdx "Inline stloc first use temp"
; V57 tmp24 [V57,T08] ( 3, 3 ) byref -> rax "Inlining Arg"
;* V58 tmp25 [V58 ] ( 0, 0 ) simd16 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
; V59 tmp26 [V59,T13] ( 5, 2.50) int -> rdx "Inline stloc first use temp"
; V60 tmp27 [V60,T09] ( 3, 3 ) byref -> rcx "Inlining Arg"
; V61 tmp28 [V61,T04] ( 7, 7 ) int -> rax "Single return block return value"
;* V62 tmp29 [V62 ] ( 0, 0 ) simd32 -> zero-ref "field V05._lower (fldOffset=0x0)" P-INDEP
;* V63 tmp30 [V63 ] ( 0, 0 ) simd32 -> zero-ref "field V05._upper (fldOffset=0x20)" P-INDEP
;* V64 tmp31 [V64 ] ( 0, 0 ) simd32 -> zero-ref "field V07._lower (fldOffset=0x0)" P-INDEP
;* V65 tmp32 [V65 ] ( 0, 0 ) simd32 -> zero-ref "field V07._upper (fldOffset=0x20)" P-INDEP
;* V66 tmp33 [V66 ] ( 0, 0 ) simd32 -> zero-ref "field V08._lower (fldOffset=0x0)" P-INDEP
;* V67 tmp34 [V67 ] ( 0, 0 ) simd32 -> zero-ref "field V08._upper (fldOffset=0x20)" P-INDEP
;* V68 tmp35 [V68 ] ( 0, 0 ) simd32 -> zero-ref "field V11._lower (fldOffset=0x0)" P-INDEP
;* V69 tmp36 [V69 ] ( 0, 0 ) simd32 -> zero-ref "field V11._upper (fldOffset=0x20)" P-INDEP
;* V70 tmp37 [V70 ] ( 0, 0 ) simd32 -> zero-ref "field V12._lower (fldOffset=0x0)" P-INDEP
;* V71 tmp38 [V71 ] ( 0, 0 ) simd32 -> zero-ref "field V12._upper (fldOffset=0x20)" P-INDEP
; V72 cse0 [V72,T05] ( 7, 6.75) int -> rax hoist multi-def "CSE #01: aggressive"
;
; Lcl frame size = 0
G_M26041_IG01:
push rbp
mov rbp, rsp
;; size=4 bbWeight=1 PerfScore 1.25
G_M26041_IG02:
cmp edx, 8
jl G_M26041_IG10
;; size=9 bbWeight=1 PerfScore 1.25
G_M26041_IG03:
mov rax, rdi
cmp edx, 16
jg SHORT G_M26041_IG05
- movzx rsi, sil
vmovd xmm0, esi
vpbroadcastb xmm0, xmm0
add edx, -8
- movsxd rax, edx
- lea rax, bword ptr [rdi+2*rax]
+ movsxd rsi, edx
+ lea rax, bword ptr [rdi+2*rsi]
cmp rdi, rax
mov rcx, rdi
cmova rcx, rax
vmovups xmm1, xmmword ptr [rcx]
vpackuswb xmm1, xmm1, xmmword ptr [rax]
vpcmpeqb xmm0, xmm1, xmm0
vptest xmm0, xmm0
je G_M26041_IG14
vpmovmskb edx, xmm0
tzcnt edx, edx
cmp edx, 8
jl SHORT G_M26041_IG04
mov rcx, rax
add edx, -8
- ;; size=83 bbWeight=0.50 PerfScore 11.88
+ ;; size=79 bbWeight=0.50 PerfScore 11.75
G_M26041_IG04:
sub rcx, rdi
shr rcx, 1
lea eax, [rcx+rdx]
jmp G_M26041_IG17
- align [6 bytes for IG06]
- ;; size=20 bbWeight=0.50 PerfScore 1.62
+ align [0 bytes for IG06]
+ ;; size=14 bbWeight=0.50 PerfScore 1.62
G_M26041_IG05:
- movzx rsi, sil
vmovd xmm0, esi
vpbroadcastb ymm0, ymm0
cmp edx, 32
jle SHORT G_M26041_IG07
- lea ecx, [rdx-0x20]
- movsxd rcx, ecx
+ lea esi, [rdx-0x20]
+ movsxd rcx, esi
lea rcx, bword ptr [rax+2*rcx]
- ;; size=28 bbWeight=0.50 PerfScore 3.38
+ ;; size=24 bbWeight=0.50 PerfScore 3.25
G_M26041_IG06:
vmovups ymm1, ymmword ptr [rax]
vpackuswb ymm1, ymm1, ymmword ptr [rax+0x20]
vpcmpeqb ymm1, ymm1, ymm0
vptest ymm1, ymm1
jne SHORT G_M26041_IG09
add rax, 64
cmp rax, rcx
jb SHORT G_M26041_IG06
;; size=29 bbWeight=4 PerfScore 64.00
G_M26041_IG07:
add edx, -16
movsxd rcx, edx
lea rcx, bword ptr [rdi+2*rcx]
cmp rax, rcx
cmova rax, rcx
vmovups ymm1, ymmword ptr [rax]
vpackuswb ymm1, ymm1, ymmword ptr [rcx]
vpcmpeqb ymm0, ymm1, ymm0
vptest ymm0, ymm0
je G_M26041_IG14
vpermq ymm0, ymm0, -40
vpmovmskb edx, ymm0
tzcnt edx, edx
cmp edx, 16
jl SHORT G_M26041_IG08
mov rax, rcx
add edx, -16
;; size=65 bbWeight=0.50 PerfScore 12.38
G_M26041_IG08:
sub rax, rdi
shr rax, 1
add eax, edx
- jmp G_M26041_IG17
- align [13 bytes for IG13]
- ;; size=26 bbWeight=0.50 PerfScore 1.50
+ jmp SHORT G_M26041_IG17
+ align [0 bytes for IG13]
+ ;; size=10 bbWeight=0.50 PerfScore 1.50
G_M26041_IG09:
sub rax, rdi
shr rax, 1
vpermq ymm0, ymm1, -40
vpmovmskb edi, ymm0
xor ecx, ecx
tzcnt ecx, edi
add eax, ecx
jmp SHORT G_M26041_IG17
;; size=26 bbWeight=0.50 PerfScore 5.12
G_M26041_IG10:
xor ecx, ecx
cmp edx, 4
jl SHORT G_M26041_IG11
add edx, -4
movsx rcx, word ptr [rdi]
movsx rax, si
cmp ecx, eax
je SHORT G_M26041_IG22
movsx rcx, word ptr [rdi+0x02]
cmp ecx, eax
je SHORT G_M26041_IG20
movsx rcx, word ptr [rdi+0x04]
cmp ecx, eax
je SHORT G_M26041_IG18
movsx rcx, word ptr [rdi+0x06]
cmp ecx, eax
je SHORT G_M26041_IG16
mov ecx, 4
;; size=54 bbWeight=0.50 PerfScore 11.62
G_M26041_IG11:
test edx, edx
jle SHORT G_M26041_IG14
;; size=4 bbWeight=0.50 PerfScore 0.62
G_M26041_IG12:
movsx rax, si
;; size=4 bbWeight=0.25 PerfScore 0.06
G_M26041_IG13:
dec edx
movsx rsi, word ptr [rdi+2*rcx]
cmp esi, eax
je SHORT G_M26041_IG15
inc rcx
test edx, edx
jg SHORT G_M26041_IG13
;; size=18 bbWeight=4 PerfScore 28.00
G_M26041_IG14:
mov eax, -1
jmp SHORT G_M26041_IG17
;; size=7 bbWeight=0.50 PerfScore 1.12
G_M26041_IG15:
mov eax, ecx
jmp SHORT G_M26041_IG17
;; size=4 bbWeight=0.50 PerfScore 1.12
G_M26041_IG16:
mov eax, 3
;; size=5 bbWeight=0.50 PerfScore 0.12
G_M26041_IG17:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
G_M26041_IG18:
mov eax, 2
;; size=5 bbWeight=0.50 PerfScore 0.12
G_M26041_IG19:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
G_M26041_IG20:
mov eax, 1
;; size=5 bbWeight=0.50 PerfScore 0.12
G_M26041_IG21:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
G_M26041_IG22:
xor eax, eax
;; size=2 bbWeight=0.50 PerfScore 0.12
G_M26041_IG23:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
-; Total bytes of code 418, prolog size 4, PerfScore 150.44, instruction count 126, allocated bytes for code 418 (MethodHash=25ef9a46) for method System.PackedSpanHelpers:IndexOf[System.SpanHelpers+DontNegate`1[short],System.PackedSpanHelpers+NopTransform](byref,short,int):int (FullOpts)
+; Total bytes of code 388, prolog size 4, PerfScore 150.19, instruction count 124, allocated bytes for code 391 (MethodHash=25ef9a46) for method System.PackedSpanHelpers:IndexOf[System.SpanHelpers+DontNegate`1[short],System.PackedSpanHelpers+NopTransform](byref,short,int):int (FullOpts)
; ============================================================
-15 (-2.19 % of base) - System.PackedSpanHelpers:IndexOfAny[System.SpanHelpers+DontNegate`1[short]](byref,short,short,short,int):int
; Assembly listing for method System.PackedSpanHelpers:IndexOfAny[System.SpanHelpers+DontNegate`1[short]](byref,short,short,short,int):int (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 11 single block inlinees; 5 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T03] ( 15, 12 ) byref -> rdi single-def
; V01 arg1 [V01,T10] ( 6, 3.75) short -> rsi single-def
; V02 arg2 [V02,T05] ( 9, 7 ) short -> rdx single-def
; V03 arg3 [V03,T06] ( 9, 7 ) short -> rcx single-def
; V04 arg4 [V04,T01] ( 15, 19.50) int -> r8
; V05 loc0 [V05,T04] ( 6, 13.50) long -> r9
; V06 loc1 [V06,T02] ( 20, 20 ) short -> rax
; V07 loc2 [V07,T00] ( 10, 22.50) byref -> rax
;* V08 loc3 [V08 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V09 loc4 [V09 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V10 loc5 [V10 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V11 loc6 [V11 ] ( 0, 0 ) byref -> zero-ref
;* V12 loc7 [V12 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[short]>
;* V13 loc8 [V13 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V14 loc9 [V14 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V15 loc10 [V15 ] ( 0, 0 ) byref -> zero-ref
;* V16 loc11 [V16 ] ( 0, 0 ) byref -> zero-ref
;* V17 loc12 [V17 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[short]>
;* V18 loc13 [V18 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V19 loc14 [V19 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
; V20 loc15 [V20,T26] ( 3, 5 ) simd32 -> mm0 <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V21 loc16 [V21,T27] ( 3, 5 ) simd32 -> mm1 <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V22 loc17 [V22,T28] ( 3, 5 ) simd32 -> mm2 <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V23 loc18 [V23,T11] ( 2, 4.50) byref -> rcx single-def
;* V24 loc19 [V24 ] ( 0, 0 ) simd32 -> zero-ref <System.Runtime.Intrinsics.Vector256`1[short]>
; V25 loc20 [V25,T24] ( 4, 16 ) simd32 -> mm3 <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V26 loc21 [V26,T25] ( 4, 12.50) simd32 -> mm3 <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V27 loc22 [V27,T14] ( 5, 2.50) byref -> rcx single-def
;* V28 loc23 [V28 ] ( 0, 0 ) byref -> zero-ref single-def
;* V29 loc24 [V29 ] ( 0, 0 ) simd32 -> zero-ref <System.Runtime.Intrinsics.Vector256`1[short]>
; V30 loc25 [V30,T29] ( 4, 2 ) simd32 -> mm3 <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V31 loc26 [V31,T30] ( 4, 2 ) simd32 -> mm0 <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V32 loc27 [V32,T33] ( 2, 1 ) simd16 -> mm0 <System.Runtime.Intrinsics.Vector128`1[ubyte]>
; V33 loc28 [V33,T34] ( 2, 1 ) simd16 -> mm1 <System.Runtime.Intrinsics.Vector128`1[ubyte]>
; V34 loc29 [V34,T35] ( 2, 1 ) simd16 -> mm2 <System.Runtime.Intrinsics.Vector128`1[ubyte]>
;* V35 loc30 [V35 ] ( 0, 0 ) byref -> zero-ref
;* V36 loc31 [V36 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[short]>
;* V37 loc32 [V37 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ubyte]>
;* V38 loc33 [V38 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ubyte]>
; V39 loc34 [V39,T15] ( 5, 2.50) byref -> rax single-def
;* V40 loc35 [V40 ] ( 0, 0 ) byref -> zero-ref single-def
;* V41 loc36 [V41 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[short]>
; V42 loc37 [V42,T31] ( 4, 2 ) simd16 -> mm3 <System.Runtime.Intrinsics.Vector128`1[ubyte]>
; V43 loc38 [V43,T32] ( 4, 2 ) simd16 -> mm0 <System.Runtime.Intrinsics.Vector128`1[ubyte]>
;# V44 OutArgs [V44 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-; V45 tmp1 [V45,T18] ( 3, 1.50) byref -> rcx
+; V45 tmp1 [V45,T18] ( 3, 1.50) byref -> rdx
;* V46 tmp2 [V46 ] ( 0, 0 ) simd16 -> zero-ref "impAppendStmt"
; V47 tmp3 [V47,T19] ( 3, 1.50) byref -> rax
;* V48 tmp4 [V48 ] ( 0, 0 ) simd32 -> zero-ref "impAppendStmt"
;* V49 tmp5 [V49 ] ( 0, 0 ) simd32 -> zero-ref "impAppendStmt"
; V50 tmp6 [V50,T09] ( 2, 4 ) int -> rax
; V51 tmp7 [V51,T20] ( 2, 1 ) int -> rax
; V52 tmp8 [V52,T21] ( 2, 1 ) int -> rax
; V53 tmp9 [V53,T22] ( 2, 1 ) int -> rax
; V54 tmp10 [V54,T23] ( 2, 1 ) int -> rax
;* V55 tmp11 [V55 ] ( 0, 0 ) ubyte -> zero-ref "Inlining Arg"
;* V56 tmp12 [V56 ] ( 0, 0 ) ubyte -> zero-ref "Inlining Arg"
;* V57 tmp13 [V57 ] ( 0, 0 ) ubyte -> zero-ref "Inlining Arg"
;* V58 tmp14 [V58 ] ( 0, 0 ) ubyte -> zero-ref "Inlining Arg"
;* V59 tmp15 [V59 ] ( 0, 0 ) ubyte -> zero-ref "Inlining Arg"
;* V60 tmp16 [V60 ] ( 0, 0 ) simd32 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
;* V61 tmp17 [V61 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
;* V62 tmp18 [V62 ] ( 0, 0 ) simd32 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
;* V63 tmp19 [V63 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
; V64 tmp20 [V64,T16] ( 5, 2.50) int -> rdx "Inline stloc first use temp"
; V65 tmp21 [V65,T12] ( 3, 3 ) byref -> rax "Inlining Arg"
;* V66 tmp22 [V66 ] ( 0, 0 ) simd16 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-; V67 tmp23 [V67,T17] ( 5, 2.50) int -> rdx "Inline stloc first use temp"
-; V68 tmp24 [V68,T13] ( 3, 3 ) byref -> rcx "Inlining Arg"
+; V67 tmp23 [V67,T17] ( 5, 2.50) int -> rcx "Inline stloc first use temp"
+; V68 tmp24 [V68,T13] ( 3, 3 ) byref -> rdx "Inlining Arg"
; V69 tmp25 [V69,T07] ( 7, 7 ) int -> rax "Single return block return value"
;* V70 tmp26 [V70 ] ( 0, 0 ) simd32 -> zero-ref "field V08._lower (fldOffset=0x0)" P-INDEP
;* V71 tmp27 [V71 ] ( 0, 0 ) simd32 -> zero-ref "field V08._upper (fldOffset=0x20)" P-INDEP
;* V72 tmp28 [V72 ] ( 0, 0 ) simd32 -> zero-ref "field V09._lower (fldOffset=0x0)" P-INDEP
;* V73 tmp29 [V73 ] ( 0, 0 ) simd32 -> zero-ref "field V09._upper (fldOffset=0x20)" P-INDEP
;* V74 tmp30 [V74 ] ( 0, 0 ) simd32 -> zero-ref "field V10._lower (fldOffset=0x0)" P-INDEP
;* V75 tmp31 [V75 ] ( 0, 0 ) simd32 -> zero-ref "field V10._upper (fldOffset=0x20)" P-INDEP
;* V76 tmp32 [V76 ] ( 0, 0 ) simd32 -> zero-ref "field V12._lower (fldOffset=0x0)" P-INDEP
;* V77 tmp33 [V77 ] ( 0, 0 ) simd32 -> zero-ref "field V12._upper (fldOffset=0x20)" P-INDEP
;* V78 tmp34 [V78 ] ( 0, 0 ) simd32 -> zero-ref "field V13._lower (fldOffset=0x0)" P-INDEP
;* V79 tmp35 [V79 ] ( 0, 0 ) simd32 -> zero-ref "field V13._upper (fldOffset=0x20)" P-INDEP
;* V80 tmp36 [V80 ] ( 0, 0 ) simd32 -> zero-ref "field V14._lower (fldOffset=0x0)" P-INDEP
;* V81 tmp37 [V81 ] ( 0, 0 ) simd32 -> zero-ref "field V14._upper (fldOffset=0x20)" P-INDEP
;* V82 tmp38 [V82 ] ( 0, 0 ) simd32 -> zero-ref "field V17._lower (fldOffset=0x0)" P-INDEP
;* V83 tmp39 [V83 ] ( 0, 0 ) simd32 -> zero-ref "field V17._upper (fldOffset=0x20)" P-INDEP
;* V84 tmp40 [V84 ] ( 0, 0 ) simd32 -> zero-ref "field V18._lower (fldOffset=0x0)" P-INDEP
;* V85 tmp41 [V85 ] ( 0, 0 ) simd32 -> zero-ref "field V18._upper (fldOffset=0x20)" P-INDEP
;* V86 tmp42 [V86 ] ( 0, 0 ) simd32 -> zero-ref "field V19._lower (fldOffset=0x0)" P-INDEP
;* V87 tmp43 [V87 ] ( 0, 0 ) simd32 -> zero-ref "field V19._upper (fldOffset=0x20)" P-INDEP
; V88 cse0 [V88,T08] ( 7, 6.75) int -> r10 hoist multi-def "CSE #01: moderate"
;
; Lcl frame size = 0
G_M33471_IG01:
push rbp
mov rbp, rsp
;; size=4 bbWeight=1 PerfScore 1.25
G_M33471_IG02:
cmp r8d, 8
jl G_M33471_IG10
;; size=10 bbWeight=1 PerfScore 1.25
G_M33471_IG03:
mov rax, rdi
cmp r8d, 16
- jg G_M33471_IG05
- movzx rsi, sil
+ jg SHORT G_M33471_IG05
+ ;; NOP compensation instructions of 4 bytes.
vmovd xmm0, esi
vpbroadcastb xmm0, xmm0
- movzx rdx, dl
vmovd xmm1, edx
vpbroadcastb xmm1, xmm1
- movzx rcx, cl
vmovd xmm2, ecx
vpbroadcastb xmm2, xmm2
add r8d, -8
- movsxd rax, r8d
- lea rax, bword ptr [rdi+2*rax]
+ movsxd rsi, r8d
+ lea rax, bword ptr [rdi+2*rsi]
cmp rdi, rax
- mov rcx, rdi
- cmova rcx, rax
- vmovups xmm3, xmmword ptr [rcx]
+ mov rdx, rdi
+ cmova rdx, rax
+ vmovups xmm3, xmmword ptr [rdx]
vpackuswb xmm3, xmm3, xmmword ptr [rax]
vpcmpeqb xmm0, xmm3, xmm0
vpcmpeqb xmm1, xmm3, xmm1
vpor xmm0, xmm1, xmm0
vpcmpeqb xmm1, xmm3, xmm2
vpor xmm0, xmm1, xmm0
vptest xmm0, xmm0
je G_M33471_IG16
- vpmovmskb edx, xmm0
- tzcnt edx, edx
- cmp edx, 8
+ vpmovmskb ecx, xmm0
+ tzcnt ecx, ecx
+ cmp ecx, 8
jl SHORT G_M33471_IG04
- mov rcx, rax
- add edx, -8
- ;; size=129 bbWeight=0.50 PerfScore 15.96
+ mov rdx, rax
+ add ecx, -8
+ ;; size=119 bbWeight=0.50 PerfScore 15.58
G_M33471_IG04:
- sub rcx, rdi
- shr rcx, 1
- lea eax, [rcx+rdx]
+ sub rdx, rdi
+ shr rdx, 1
+ lea eax, [rdx+rcx]
jmp G_M33471_IG19
- align [0 bytes for IG06]
- ;; size=14 bbWeight=0.50 PerfScore 1.62
+ align [1 bytes for IG06]
+ ;; size=15 bbWeight=0.50 PerfScore 1.62
G_M33471_IG05:
- movzx rsi, sil
vmovd xmm0, esi
vpbroadcastb ymm0, ymm0
- movzx rdx, dl
vmovd xmm1, edx
vpbroadcastb ymm1, ymm1
- movzx rcx, cl
vmovd xmm2, ecx
vpbroadcastb ymm2, ymm2
cmp r8d, 32
jle SHORT G_M33471_IG07
- lea ecx, [r8-0x20]
- movsxd rcx, ecx
- lea rcx, bword ptr [rax+2*rcx]
- ;; size=54 bbWeight=0.50 PerfScore 7.62
+ lea esi, [r8-0x20]
+ movsxd rdx, esi
+ lea rcx, bword ptr [rax+2*rdx]
+ ;; size=44 bbWeight=0.50 PerfScore 7.25
G_M33471_IG06:
vmovups ymm3, ymmword ptr [rax]
vpackuswb ymm3, ymm3, ymmword ptr [rax+0x20]
vpcmpeqb ymm4, ymm3, ymm0
vpcmpeqb ymm5, ymm3, ymm1
vpor ymm4, ymm5, ymm4
vpcmpeqb ymm3, ymm3, ymm2
vpor ymm3, ymm3, ymm4
vptest ymm3, ymm3
jne SHORT G_M33471_IG09
add rax, 64
cmp rax, rcx
jb SHORT G_M33471_IG06
;; size=45 bbWeight=4 PerfScore 70.67
G_M33471_IG07:
add r8d, -16
movsxd rcx, r8d
lea rcx, bword ptr [rdi+2*rcx]
cmp rax, rcx
cmova rax, rcx
vmovups ymm3, ymmword ptr [rax]
vpackuswb ymm3, ymm3, ymmword ptr [rcx]
vpcmpeqb ymm0, ymm0, ymm3
vpcmpeqb ymm1, ymm1, ymm3
vpor ymm0, ymm1, ymm0
vpcmpeqb ymm1, ymm2, ymm3
vpor ymm0, ymm1, ymm0
vptest ymm0, ymm0
je G_M33471_IG16
vpermq ymm0, ymm0, -40
vpmovmskb edx, ymm0
tzcnt edx, edx
cmp edx, 16
jl SHORT G_M33471_IG08
mov rax, rcx
add edx, -16
;; size=82 bbWeight=0.50 PerfScore 13.21
G_M33471_IG08:
sub rax, rdi
shr rax, 1
add eax, edx
jmp G_M33471_IG19
- align [0 bytes for IG13]
- ;; size=13 bbWeight=0.50 PerfScore 1.50
+ align [4 bytes for IG13]
+ ;; size=17 bbWeight=0.50 PerfScore 1.50
G_M33471_IG09:
sub rax, rdi
shr rax, 1
vpermq ymm0, ymm3, -40
vpmovmskb edi, ymm0
xor ecx, ecx
tzcnt ecx, edi
add eax, ecx
jmp G_M33471_IG19
;; size=29 bbWeight=0.50 PerfScore 5.12
G_M33471_IG10:
xor r9d, r9d
cmp r8d, 4
jl G_M33471_IG11
add r8d, -4
movsx rax, word ptr [rdi]
movsx r10, si
cmp eax, r10d
je G_M33471_IG24
movsx r9, dx
cmp eax, r9d
je G_M33471_IG24
movsx r9, cx
cmp eax, r9d
sete al
movzx rax, al
test al, al
jne G_M33471_IG24
movsx rax, word ptr [rdi+0x02]
cmp eax, r10d
je G_M33471_IG22
movsx r9, dx
cmp eax, r9d
je G_M33471_IG22
movsx r9, cx
cmp eax, r9d
sete al
movzx rax, al
test al, al
jne G_M33471_IG22
movsx rax, word ptr [rdi+0x04]
cmp eax, r10d
je G_M33471_IG20
movsx r9, dx
cmp eax, r9d
je G_M33471_IG20
movsx r9, cx
cmp eax, r9d
sete al
movzx rax, al
test al, al
jne SHORT G_M33471_IG20
movsx rax, word ptr [rdi+0x06]
cmp eax, r10d
je SHORT G_M33471_IG18
movsx r10, dx
cmp eax, r10d
je SHORT G_M33471_IG18
movsx r9, cx
cmp eax, r9d
sete al
movzx rax, al
test al, al
jne SHORT G_M33471_IG18
mov r9d, 4
;; size=202 bbWeight=0.50 PerfScore 20.62
G_M33471_IG11:
test r8d, r8d
jle SHORT G_M33471_IG16
;; size=5 bbWeight=0.50 PerfScore 0.62
G_M33471_IG12:
movsx r10, si
;; size=4 bbWeight=0.25 PerfScore 0.06
G_M33471_IG13:
dec r8d
movsx rax, word ptr [rdi+2*r9]
cmp eax, r10d
je SHORT G_M33471_IG17
;; size=13 bbWeight=4 PerfScore 22.00
G_M33471_IG14:
movsx rsi, dx
cmp eax, esi
je SHORT G_M33471_IG17
movsx rsi, cx
cmp eax, esi
sete al
movzx rax, al
test al, al
jne SHORT G_M33471_IG17
;; size=24 bbWeight=2 PerfScore 9.00
G_M33471_IG15:
inc r9
test r8d, r8d
jg SHORT G_M33471_IG13
;; size=8 bbWeight=4 PerfScore 6.00
G_M33471_IG16:
mov eax, -1
jmp SHORT G_M33471_IG19
;; size=7 bbWeight=0.50 PerfScore 1.12
G_M33471_IG17:
mov eax, r9d
jmp SHORT G_M33471_IG19
;; size=5 bbWeight=0.50 PerfScore 1.12
G_M33471_IG18:
mov eax, 3
;; size=5 bbWeight=0.50 PerfScore 0.12
G_M33471_IG19:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
G_M33471_IG20:
mov eax, 2
;; size=5 bbWeight=0.50 PerfScore 0.12
G_M33471_IG21:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
G_M33471_IG22:
mov eax, 1
;; size=5 bbWeight=0.50 PerfScore 0.12
G_M33471_IG23:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
G_M33471_IG24:
xor eax, eax
;; size=2 bbWeight=0.50 PerfScore 0.12
G_M33471_IG25:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
-; Total bytes of code 685, prolog size 4, PerfScore 184.27, instruction count 195, allocated bytes for code 685 (MethodHash=2ec77d40) for method System.PackedSpanHelpers:IndexOfAny[System.SpanHelpers+DontNegate`1[short]](byref,short,short,short,int):int (FullOpts)
+; Total bytes of code 670, prolog size 4, PerfScore 183.52, instruction count 189, allocated bytes for code 670 (MethodHash=2ec77d40) for method System.PackedSpanHelpers:IndexOfAny[System.SpanHelpers+DontNegate`1[short]](byref,short,short,short,int):int (FullOpts)
; ============================================================ -12 (-1.85 % of base) - System.String:MakeSeparatorListVectorized(System.ReadOnlySpan`1[ushort],byref,ushort,ushort,ushort) ; Assembly listing for method System.String:MakeSeparatorListVectorized(System.ReadOnlySpan`1[ushort],byref,ushort,ushort,ushort) (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 5 single block inlinees; 3 inlinees without PGO data
; Final local variable assignments
;
;* V00 arg0 [V00 ] ( 0, 0 ) struct (16) zero-ref multireg-arg ld-addr-op single-def <System.ReadOnlySpan`1[ushort]>
; V01 arg1 [V01,T00] ( 17,140 ) byref -> rbx single-def
; V02 arg2 [V02,T20] ( 4, 3 ) ushort -> rcx single-def
; V03 arg3 [V03,T17] ( 5, 5 ) ushort -> r15 single-def
; V04 arg4 [V04,T18] ( 5, 5 ) ushort -> r14 single-def
; V05 loc0 [V05,T12] ( 5, 18 ) long -> r13
; V06 loc1 [V06,T01] ( 16, 87 ) long -> r12
; V07 loc2 [V07,T13] ( 4, 13 ) byref -> [rbp-0xE0] spill-single-def
;* V08 loc3 [V08 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ushort]>
;* V09 loc4 [V09 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ushort]>
;* V10 loc5 [V10 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ushort]>
;* V11 loc6 [V11 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ushort]>
;* V12 loc7 [V12 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ushort]>
;* V13 loc8 [V13 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ushort]>
;* V14 loc9 [V14 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V15 loc10 [V15 ] ( 0, 0 ) long -> zero-ref
;* V16 loc11 [V16 ] ( 0, 0 ) int -> zero-ref
; V17 loc12 [V17,T29] ( 2, 4.50) simd32 -> [rbp-0x50] spill-single-def <System.Runtime.Intrinsics.Vector256`1[ushort]>
; V18 loc13 [V18,T30] ( 2, 4.50) simd32 -> [rbp-0x70] spill-single-def <System.Runtime.Intrinsics.Vector256`1[ushort]>
; V19 loc14 [V19,T31] ( 2, 4.50) simd32 -> [rbp-0x90] spill-single-def <System.Runtime.Intrinsics.Vector256`1[ushort]>
;* V20 loc15 [V20 ] ( 0, 0 ) simd32 -> zero-ref <System.Runtime.Intrinsics.Vector256`1[ushort]>
;* V21 loc16 [V21 ] ( 0, 0 ) simd32 -> zero-ref <System.Runtime.Intrinsics.Vector256`1[ushort]>
;* V22 loc17 [V22 ] ( 0, 0 ) simd32 -> zero-ref <System.Runtime.Intrinsics.Vector256`1[ushort]>
; V23 loc18 [V23,T27] ( 4, 14 ) simd32 -> mm3 <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V24 loc19 [V24,T02] ( 5, 66 ) int -> [rbp-0x94]
;* V25 loc20 [V25 ] ( 0, 0 ) int -> zero-ref
; V26 loc21 [V26,T32] ( 2, 4.50) simd16 -> [rbp-0xB0] spill-single-def <System.Runtime.Intrinsics.Vector128`1[ushort]>
; V27 loc22 [V27,T33] ( 2, 4.50) simd16 -> [rbp-0xC0] spill-single-def <System.Runtime.Intrinsics.Vector128`1[ushort]>
; V28 loc23 [V28,T34] ( 2, 4.50) simd16 -> [rbp-0xD0] spill-single-def <System.Runtime.Intrinsics.Vector128`1[ushort]>
;* V29 loc24 [V29 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ushort]>
;* V30 loc25 [V30 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ushort]>
;* V31 loc26 [V31 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ushort]>
; V32 loc27 [V32,T28] ( 4, 14 ) simd16 -> mm3 <System.Runtime.Intrinsics.Vector128`1[ubyte]>
; V33 loc28 [V33,T03] ( 5, 66 ) int -> [rbp-0xD4]
;* V34 loc29 [V34 ] ( 0, 0 ) int -> zero-ref
; V35 loc30 [V35,T14] ( 4, 12 ) ushort -> rdi
;# V36 OutArgs [V36 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
; V37 tmp1 [V37,T25] ( 4, 32 ) simd16 -> mm3 "dup spill"
; V38 tmp2 [V38,T26] ( 4, 32 ) simd32 -> mm3 "dup spill"
;* V39 tmp3 [V39 ] ( 0, 0 ) struct (16) zero-ref ld-addr-op "Inlining Arg" <System.ReadOnlySpan`1[ushort]>
; V40 tmp4 [V40,T06] ( 4, 48 ) int -> r8 "Inline stloc first use temp"
;* V41 tmp5 [V41 ] ( 0, 0 ) struct (16) zero-ref ld-addr-op "Inline stloc first use temp" <System.Span`1[int]>
; V42 tmp6 [V42,T04] ( 3, 64 ) int -> rsi "Inlining Arg"
; V43 tmp7 [V43,T07] ( 4, 48 ) int -> r8 "Inline stloc first use temp"
;* V44 tmp8 [V44 ] ( 0, 0 ) struct (16) zero-ref ld-addr-op "Inline stloc first use temp" <System.Span`1[int]>
; V45 tmp9 [V45,T05] ( 3, 64 ) int -> rsi "Inlining Arg"
; V46 tmp10 [V46,T16] ( 4, 8 ) int -> rcx "Inline stloc first use temp"
;* V47 tmp11 [V47 ] ( 0, 0 ) struct (16) zero-ref ld-addr-op "Inline stloc first use temp" <System.Span`1[int]>
; V48 tmp12 [V48,T15] ( 3, 12 ) int -> rsi "Inlining Arg"
; V49 tmp13 [V49,T21] ( 2, 2 ) byref -> rdi single-def "field V00._reference (fldOffset=0x0)" P-INDEP
; V50 tmp14 [V50,T23] ( 2, 2 ) int -> rsi single-def "field V00._length (fldOffset=0x8)" P-INDEP
;* V51 tmp15 [V51 ] ( 0, 0 ) simd32 -> zero-ref "field V08._lower (fldOffset=0x0)" P-INDEP
;* V52 tmp16 [V52 ] ( 0, 0 ) simd32 -> zero-ref "field V08._upper (fldOffset=0x20)" P-INDEP
;* V53 tmp17 [V53 ] ( 0, 0 ) simd32 -> zero-ref "field V09._lower (fldOffset=0x0)" P-INDEP
;* V54 tmp18 [V54 ] ( 0, 0 ) simd32 -> zero-ref "field V09._upper (fldOffset=0x20)" P-INDEP
;* V55 tmp19 [V55 ] ( 0, 0 ) simd32 -> zero-ref "field V10._lower (fldOffset=0x0)" P-INDEP
;* V56 tmp20 [V56 ] ( 0, 0 ) simd32 -> zero-ref "field V10._upper (fldOffset=0x20)" P-INDEP
;* V57 tmp21 [V57 ] ( 0, 0 ) simd32 -> zero-ref "field V11._lower (fldOffset=0x0)" P-INDEP
;* V58 tmp22 [V58 ] ( 0, 0 ) simd32 -> zero-ref "field V11._upper (fldOffset=0x20)" P-INDEP
;* V59 tmp23 [V59 ] ( 0, 0 ) simd32 -> zero-ref "field V12._lower (fldOffset=0x0)" P-INDEP
;* V60 tmp24 [V60 ] ( 0, 0 ) simd32 -> zero-ref "field V12._upper (fldOffset=0x20)" P-INDEP
;* V61 tmp25 [V61 ] ( 0, 0 ) simd32 -> zero-ref "field V13._lower (fldOffset=0x0)" P-INDEP
;* V62 tmp26 [V62 ] ( 0, 0 ) simd32 -> zero-ref "field V13._upper (fldOffset=0x20)" P-INDEP
;* V63 tmp27 [V63 ] ( 0, 0 ) simd32 -> zero-ref "field V14._lower (fldOffset=0x0)" P-INDEP
;* V64 tmp28 [V64 ] ( 0, 0 ) simd32 -> zero-ref "field V14._upper (fldOffset=0x20)" P-INDEP
;* V65 tmp29 [V65 ] ( 0, 0 ) byref -> zero-ref single-def "field V39._reference (fldOffset=0x0)" P-INDEP
;* V66 tmp30 [V66 ] ( 0, 0 ) int -> zero-ref "field V39._length (fldOffset=0x8)" P-INDEP
; V67 tmp31 [V67,T10] ( 2, 24 ) byref -> r9 "field V41._reference (fldOffset=0x0)" P-INDEP
; V68 tmp32 [V68,T08] ( 2, 32 ) int -> rdi "field V41._length (fldOffset=0x8)" P-INDEP
; V69 tmp33 [V69,T11] ( 2, 24 ) byref -> r9 "field V44._reference (fldOffset=0x0)" P-INDEP
; V70 tmp34 [V70,T09] ( 2, 32 ) int -> rdi "field V44._length (fldOffset=0x8)" P-INDEP
; V71 tmp35 [V71,T22] ( 2, 4 ) byref -> r8 "field V47._reference (fldOffset=0x0)" P-INDEP
; V72 tmp36 [V72,T24] ( 2, 4 ) int -> rdi "field V47._length (fldOffset=0x8)" P-INDEP
; V73 cse0 [V73,T19] ( 5, 6 ) int -> [rbp-0xD8] multi-def "CSE #01: moderate"
;
; Lcl frame size = 184
; ---------------------------------------------------------------------------
; Code for System.String:MakeSeparatorListVectorized (x64, Unix, Intel syntax).
; This is a JitDiff listing: lines prefixed '-' are the base codegen, lines
; prefixed '+' are the codegen with the change applied, all others are common.
;
; Register roles, per the local-variable table printed above this listing:
;   rdi / esi  = arg0 span _reference / _length
;   rbx        = arg1 (byref), copied from rdx in IG01
;   ecx        = arg2, r15d = arg3 (from r8d), r14d = arg4 (from r9d)
;   r13        = loc0 (esi zero-extended), r12 = loc1 (running element index)
;   [rbp-0xE0] = loc2 (copy of rdi), [rbp-0xD8] = zero-extended arg2 (cse0)
; ---------------------------------------------------------------------------
G_M10293_IG01:
; Prolog: save callee-saved registers, reserve 184 bytes of locals, and move
; the arguments that must survive calls into callee-saved registers.
push rbp
push r15
push r14
push r13
push r12
push rbx
sub rsp, 184
lea rbp, [rsp+0xE0]
mov rbx, rdx
mov r15d, r8d
mov r14d, r9d
;; size=34 bbWeight=1 PerfScore 7.50
G_M10293_IG02:
; r13 = length, r12 = 0, spill the span reference; inputs of 32 or more
; elements take the 256-bit path at IG12, shorter ones fall into IG03.
mov r13d, esi
xor r12d, r12d
mov rax, rdi
mov bword ptr [rbp-0xE0], rax
cmp r13, 32
jae G_M10293_IG12
;; size=26 bbWeight=1 PerfScore 3.00
G_M10293_IG03:
; 128-bit setup: broadcast each of the three ushort arguments to all 8 word
; lanes and spill the vectors. arg2 is still zero-extended because that value
; is also stored to [rbp-0xD8] and compared as a 32-bit value in IG23.
movzx rdx, cx
mov dword ptr [rbp-0xD8], edx
vmovd xmm0, edx
vpbroadcastw xmm0, xmm0
vmovaps xmmword ptr [rbp-0xB0], xmm0
; Diff hunk: vpbroadcastw consumes only the low 16 bits of its source, so the
; movzx feeding vmovd is dropped and r15d is moved directly.
- movzx rdi, r15w
- vmovd xmm1, edi
+ vmovd xmm1, r15d
vpbroadcastw xmm1, xmm1
vmovaps xmmword ptr [rbp-0xC0], xmm1
; Diff hunk: same simplification for r14d.
- movzx rdi, r14w
- vmovd xmm2, edi
+ vmovd xmm2, r14d
vpbroadcastw xmm2, xmm2
vmovaps xmmword ptr [rbp-0xD0], xmm2
jmp SHORT G_M10293_IG05
- ;; size=70 bbWeight=0.50 PerfScore 7.88
+ ;; size=64 bbWeight=0.50 PerfScore 7.62
G_M10293_IG04:
; Loop back-edge target: reload the span reference into rax.
mov rax, bword ptr [rbp-0xE0]
;; size=7 bbWeight=2 PerfScore 2.00
G_M10293_IG05:
; Load 8 words at index r12, compare against all three broadcast vectors and
; OR the results; skip to IG11 when no lane matched.
vmovups xmm3, xmmword ptr [rax+2*r12]
vpcmpeqw xmm4, xmm3, xmm0
vpcmpeqw xmm5, xmm3, xmm1
vpor xmm4, xmm5, xmm4
vpcmpeqw xmm3, xmm3, xmm2
vpor xmm3, xmm3, xmm4
vptest xmm3, xmm3
je SHORT G_M10293_IG11
;; size=33 bbWeight=4 PerfScore 40.67
G_M10293_IG06:
; vpmovmskb yields one bit per byte; masking with 0x5555 keeps the even bits,
; i.e. one bit per 16-bit lane.
vpmovmskb ecx, xmm3
and ecx, 0x5555
;; size=10 bbWeight=2 PerfScore 4.50
G_M10293_IG07:
; Per-match loop: save the mask, esi = (tzcnt(mask) >> 1) + r12d is the index
; of the lowest matching lane. Then load three fields of the object at rbx
; and store in place (IG09) when r8d < edi (unsigned), else call (IG08).
; NOTE(review): [rbx+0x08]/[rbx+0x10]/[rbx+0x18] look like the builder's
; position, span reference and span length - confirm against ValueListBuilder.
mov dword ptr [rbp-0xD4], ecx
xor edi, edi
tzcnt edi, ecx
shr edi, 1
mov esi, edi
add esi, r12d
mov r8d, dword ptr [rbx+0x08]
mov r9, bword ptr [rbx+0x10]
mov edi, dword ptr [rbx+0x18]
cmp r8d, edi
jb SHORT G_M10293_IG09
;; size=35 bbWeight=16 PerfScore 184.00
G_M10293_IG08:
; Out-of-line path: AddWithResize(this = rbx, value = esi).
mov rdi, rbx
mov r8, 0xD1FFAB1E ; code for System.Collections.Generic.ValueListBuilder`1[int]:AddWithResize(int):this
call [r8]System.Collections.Generic.ValueListBuilder`1[int]:AddWithResize(int):this
jmp SHORT G_M10293_IG10
;; size=18 bbWeight=8 PerfScore 44.00
G_M10293_IG09:
; In-place path: store esi at r9[r8d] and write back the incremented r8d.
mov edi, r8d
mov dword ptr [r9+4*rdi], esi
inc r8d
mov dword ptr [rbx+0x08], r8d
;; size=14 bbWeight=8 PerfScore 20.00
G_M10293_IG10:
; Clear the lowest set bit of the saved mask; repeat while matches remain.
blsr ecx, dword ptr [rbp-0xD4]
jne SHORT G_M10293_IG07
;; size=11 bbWeight=16 PerfScore 48.00
G_M10293_IG11:
; Advance by 8 elements and continue while r12 <= r13 - 8 (unsigned). The
; three broadcast vectors are reloaded from their spill slots between the cmp
; and the jbe; these moves do not modify flags. Otherwise go to the scalar
; loop at IG22.
add r12, 8
lea rdi, [r13-0x08]
cmp r12, rdi
vmovaps xmm0, xmmword ptr [rbp-0xB0]
vmovaps xmm1, xmmword ptr [rbp-0xC0]
vmovaps xmm2, xmmword ptr [rbp-0xD0]
jbe G_M10293_IG04
jmp G_M10293_IG22
;; size=46 bbWeight=4 PerfScore 52.00
G_M10293_IG12:
; 256-bit setup: same as IG03 but broadcasting to 16 word lanes of ymm0-ymm2.
movzx rdx, cx
mov dword ptr [rbp-0xD8], edx
vmovd xmm0, edx
vpbroadcastw ymm0, ymm0
vmovups ymmword ptr [rbp-0x50], ymm0
; Diff hunk: redundant zero-extension of r15w removed before the broadcast.
- movzx rdi, r15w
- vmovd xmm1, edi
+ vmovd xmm1, r15d
vpbroadcastw ymm1, ymm1
vmovups ymmword ptr [rbp-0x70], ymm1
; Diff hunk: redundant zero-extension of r14w removed before the broadcast.
- movzx rdi, r14w
- vmovd xmm2, edi
+ vmovd xmm2, r14d
vpbroadcastw ymm2, ymm2
vmovups ymmword ptr [rbp-0x90], ymm2
jmp SHORT G_M10293_IG14
- ;; size=64 bbWeight=0.50 PerfScore 9.38
+ ;; size=58 bbWeight=0.50 PerfScore 9.12
G_M10293_IG13:
; Loop back-edge target: reload the span reference into rax.
mov rax, bword ptr [rbp-0xE0]
;; size=7 bbWeight=2 PerfScore 2.00
G_M10293_IG14:
; Load 16 words at index r12 and compare against the three broadcast vectors;
; skip to IG20 when no lane matched.
vmovups ymm3, ymmword ptr [rax+2*r12]
vpcmpeqw ymm4, ymm3, ymm0
vpcmpeqw ymm5, ymm3, ymm1
vpor ymm4, ymm5, ymm4
vpcmpeqw ymm3, ymm3, ymm2
vpor ymm3, ymm3, ymm4
vptest ymm3, ymm3
je SHORT G_M10293_IG20
;; size=33 bbWeight=4 PerfScore 52.67
G_M10293_IG15:
; NOTE(review): 0xD1FFAB1E is printed here in place of the real immediate;
; by analogy with 0x5555 in IG06 it is presumably 0x55555555 - confirm.
vpmovmskb ecx, ymm3
and ecx, 0xD1FFAB1E
;; size=10 bbWeight=2 PerfScore 6.50
G_M10293_IG16:
; Per-match loop, identical in shape to IG07 (mask saved at [rbp-0x94]).
mov dword ptr [rbp-0x94], ecx
xor edi, edi
tzcnt edi, ecx
shr edi, 1
mov esi, edi
add esi, r12d
mov r8d, dword ptr [rbx+0x08]
mov r9, bword ptr [rbx+0x10]
mov edi, dword ptr [rbx+0x18]
cmp r8d, edi
jb SHORT G_M10293_IG18
;; size=35 bbWeight=16 PerfScore 184.00
G_M10293_IG17:
; Out-of-line path: AddWithResize(this = rbx, value = esi).
mov rdi, rbx
mov r8, 0xD1FFAB1E ; code for System.Collections.Generic.ValueListBuilder`1[int]:AddWithResize(int):this
call [r8]System.Collections.Generic.ValueListBuilder`1[int]:AddWithResize(int):this
jmp SHORT G_M10293_IG19
;; size=18 bbWeight=8 PerfScore 44.00
G_M10293_IG18:
; In-place path: store esi at r9[r8d] and write back the incremented r8d.
mov edi, r8d
mov dword ptr [r9+4*rdi], esi
inc r8d
mov dword ptr [rbx+0x08], r8d
;; size=14 bbWeight=8 PerfScore 20.00
G_M10293_IG19:
; Clear the lowest set bit of the saved mask; repeat while matches remain.
blsr ecx, dword ptr [rbp-0x94]
jne SHORT G_M10293_IG16
;; size=11 bbWeight=16 PerfScore 48.00
G_M10293_IG20:
; Advance by 16 elements and continue while r12 <= r13 - 16 (unsigned),
; reloading the broadcast vectors from their spill slots; otherwise go to the
; scalar loop at IG22.
add r12, 16
lea rdi, [r13-0x10]
cmp r12, rdi
vmovups ymm0, ymmword ptr [rbp-0x50]
vmovups ymm1, ymmword ptr [rbp-0x70]
vmovups ymm2, ymmword ptr [rbp-0x90]
jbe G_M10293_IG13
jmp SHORT G_M10293_IG22
;; size=37 bbWeight=4 PerfScore 64.00
G_M10293_IG21:
; Scalar loop increment.
inc r12
;; size=3 bbWeight=4 PerfScore 1.00
G_M10293_IG22:
; Scalar loop condition: exit to the epilog once r12 >= r13 (unsigned).
cmp r12, r13
jae SHORT G_M10293_IG27
;; size=5 bbWeight=8 PerfScore 10.00
G_M10293_IG23:
; Load the element at index r12 (zero-extended) and compare it with the
; zero-extended arg2 saved at [rbp-0xD8].
mov rax, bword ptr [rbp-0xE0]
movzx rdi, word ptr [rax+2*r12]
mov edx, dword ptr [rbp-0xD8]
cmp edi, edx
je SHORT G_M10293_IG25
;; size=22 bbWeight=4 PerfScore 21.00
G_M10293_IG24:
; Compare with arg3 and arg4. These zero-extensions feed 32-bit compares, not
; broadcasts, and are identical in base and diff.
movzx rsi, r15w
cmp edi, esi
je SHORT G_M10293_IG25
movzx rsi, r14w
cmp edi, esi
jne SHORT G_M10293_IG21
;; size=16 bbWeight=2 PerfScore 6.00
G_M10293_IG25:
; Match found: record index r12d, in place (IG26) when ecx < edi (unsigned),
; otherwise through AddWithResize(this = rbx, value = esi).
mov esi, r12d
mov ecx, dword ptr [rbx+0x08]
mov r8, bword ptr [rbx+0x10]
mov edi, dword ptr [rbx+0x18]
cmp ecx, edi
jb SHORT G_M10293_IG26
mov rdi, rbx
mov rcx, 0xD1FFAB1E ; code for System.Collections.Generic.ValueListBuilder`1[int]:AddWithResize(int):this
call [rcx]System.Collections.Generic.ValueListBuilder`1[int]:AddWithResize(int):this
jmp SHORT G_M10293_IG21
;; size=34 bbWeight=2 PerfScore 26.00
G_M10293_IG26:
; In-place path: store esi at r8[ecx] and write back the incremented ecx.
mov edi, ecx
mov dword ptr [r8+4*rdi], esi
inc ecx
mov dword ptr [rbx+0x08], ecx
jmp SHORT G_M10293_IG21
;; size=13 bbWeight=2 PerfScore 9.00
G_M10293_IG27:
; Epilog: clear upper ymm state, release locals, restore callee-saved
; registers in reverse push order.
vzeroupper
add rsp, 184
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
ret
;; size=21 bbWeight=1 PerfScore 5.25
-; Total bytes of code 647, prolog size 34, PerfScore 922.33, instruction count 163, allocated bytes for code 647 (MethodHash=8ebed7ca) for method System.String:MakeSeparatorListVectorized(System.ReadOnlySpan`1[ushort],byref,ushort,ushort,ushort) (FullOpts)
+; Total bytes of code 635, prolog size 34, PerfScore 921.83, instruction count 159, allocated bytes for code 635 (MethodHash=8ebed7ca) for method System.String:MakeSeparatorListVectorized(System.ReadOnlySpan`1[ushort],byref,ushort,ushort,ushort) (FullOpts)
; ============================================================ -8 (-16.33 % of base) - System.Numerics.Vector:CreateSequence[short](short,short):System.Numerics.Vector`1[short] ; Assembly listing for method System.Numerics.Vector:CreateSequence[short](short,short):System.Numerics.Vector`1[short] (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX - Unix
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
; Final local variable assignments
;
; V00 RetBuf [V00,T00] ( 4, 4 ) byref -> rdi single-def
; V01 arg0 [V01,T01] ( 3, 3 ) short -> rsi single-def
; V02 arg1 [V02,T02] ( 3, 3 ) short -> rdx single-def
;# V03 OutArgs [V03 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;
; Lcl frame size = 0
; ---------------------------------------------------------------------------
; Code for System.Numerics.Vector:CreateSequence[short] (x64, Unix).
; JitDiff listing: '-' lines are base codegen, '+' lines are the new codegen.
; Inputs per the local-variable table above: rdi = return buffer,
; si = arg0, dx = arg1. Stores 16 word lanes where lane i = arg0 + i * arg1
; (16-bit wrap-around) to [rdi] and returns rdi in rax.
; ---------------------------------------------------------------------------
G_M54365_IG01:
;; size=0 bbWeight=1 PerfScore 0.00
G_M54365_IG02:
; Diff hunk: vpbroadcastw reads only the low 16 bits of its source, so the
; sign-extension of dx before vmovd is dropped.
- movsx rax, dx
- vmovd xmm0, eax
+ vmovd xmm0, edx
vpbroadcastw ymm0, ymm0
; Multiply the broadcast arg1 by the lane indices 0..15 held in RWD00.
vpmullw ymm0, ymm0, ymmword ptr [reloc @RWD00]
; Diff hunk: same simplification for si.
- movsx rax, si
- vmovd xmm1, eax
+ vmovd xmm1, esi
vpbroadcastw ymm1, ymm1
; Add the broadcast arg0, store the vector through the return buffer and
; return the buffer address.
vpaddw ymm0, ymm1, ymm0
vmovups ymmword ptr [rdi], ymm0
mov rax, rdi
- ;; size=45 bbWeight=1 PerfScore 17.08
+ ;; size=37 bbWeight=1 PerfScore 16.58
G_M54365_IG03:
vzeroupper
ret
;; size=4 bbWeight=1 PerfScore 2.00
; Read-only data: sixteen little-endian words 0, 1, 2, ..., 15.
RWD00 dq 0003000200010000h, 0007000600050004h, 000B000A00090008h, 000F000E000D000Ch
-; Total bytes of code 49, prolog size 0, PerfScore 19.08, instruction count 12, allocated bytes for code 49 (MethodHash=28852ba2) for method System.Numerics.Vector:CreateSequence[short](short,short):System.Numerics.Vector`1[short] (FullOpts)
+; Total bytes of code 41, prolog size 0, PerfScore 18.58, instruction count 10, allocated bytes for code 41 (MethodHash=28852ba2) for method System.Numerics.Vector:CreateSequence[short](short,short):System.Numerics.Vector`1[short] (FullOpts)
; ============================================================ -8 (-17.39 % of base) - System.Runtime.Intrinsics.Vector128:CreateSequence[short](short,short):System.Runtime.Intrinsics.Vector128`1[short] ; Assembly listing for method System.Runtime.Intrinsics.Vector128:CreateSequence[short](short,short):System.Runtime.Intrinsics.Vector128`1[short] (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX - Unix
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
; Final local variable assignments
;
; V00 RetBuf [V00,T00] ( 4, 4 ) byref -> rdi single-def
; V01 arg0 [V01,T01] ( 3, 3 ) short -> rsi single-def
; V02 arg1 [V02,T02] ( 3, 3 ) short -> rdx single-def
;# V03 OutArgs [V03 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;
; Lcl frame size = 0
; ---------------------------------------------------------------------------
; Code for System.Runtime.Intrinsics.Vector128:CreateSequence[short]
; (x64, Unix). JitDiff listing: '-' lines are base codegen, '+' lines are the
; new codegen. Inputs per the local-variable table above: rdi = return
; buffer, si = arg0, dx = arg1. Stores 8 word lanes where
; lane i = arg0 + i * arg1 (16-bit wrap-around) to [rdi]; returns rdi in rax.
; ---------------------------------------------------------------------------
G_M32125_IG01:
;; size=0 bbWeight=1 PerfScore 0.00
G_M32125_IG02:
; Diff hunk: vpbroadcastw reads only the low 16 bits of its source, so the
; sign-extension of dx before vmovd is dropped.
- movsx rax, dx
- vmovd xmm0, eax
+ vmovd xmm0, edx
vpbroadcastw xmm0, xmm0
; Multiply the broadcast arg1 by the lane indices 0..7 held in RWD00.
vpmullw xmm0, xmm0, xmmword ptr [reloc @RWD00]
; Diff hunk: same simplification for si.
- movsx rax, si
- vmovd xmm1, eax
+ vmovd xmm1, esi
vpbroadcastw xmm1, xmm1
; Add the broadcast arg0, store the vector through the return buffer and
; return the buffer address.
vpaddw xmm0, xmm1, xmm0
vmovups xmmword ptr [rdi], xmm0
mov rax, rdi
- ;; size=45 bbWeight=1 PerfScore 15.08
+ ;; size=37 bbWeight=1 PerfScore 14.58
G_M32125_IG03:
; Only xmm registers are used in this method; the listing emits no vzeroupper.
ret
;; size=1 bbWeight=1 PerfScore 1.00
; Read-only data: eight little-endian words 0, 1, 2, ..., 7.
RWD00 dq 0003000200010000h, 0007000600050004h
-; Total bytes of code 46, prolog size 0, PerfScore 16.08, instruction count 11, allocated bytes for code 46 (MethodHash=a9108282) for method System.Runtime.Intrinsics.Vector128:CreateSequence[short](short,short):System.Runtime.Intrinsics.Vector128`1[short] (FullOpts)
+; Total bytes of code 38, prolog size 0, PerfScore 15.58, instruction count 9, allocated bytes for code 38 (MethodHash=a9108282) for method System.Runtime.Intrinsics.Vector128:CreateSequence[short](short,short):System.Runtime.Intrinsics.Vector128`1[short] (FullOpts)
; ============================================================ -8 (-16.33 % of base) - System.Runtime.Intrinsics.Vector256:CreateSequence[short](short,short):System.Runtime.Intrinsics.Vector256`1[short] ; Assembly listing for method System.Runtime.Intrinsics.Vector256:CreateSequence[short](short,short):System.Runtime.Intrinsics.Vector256`1[short] (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX - Unix
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
; Final local variable assignments
;
; V00 RetBuf [V00,T00] ( 4, 4 ) byref -> rdi single-def
; V01 arg0 [V01,T01] ( 3, 3 ) short -> rsi single-def
; V02 arg1 [V02,T02] ( 3, 3 ) short -> rdx single-def
;# V03 OutArgs [V03 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;
; Lcl frame size = 0
; ---------------------------------------------------------------------------
; Code for System.Runtime.Intrinsics.Vector256:CreateSequence[short]
; (x64, Unix). JitDiff listing: '-' lines are base codegen, '+' lines are the
; new codegen. Inputs per the local-variable table above: rdi = return
; buffer, si = arg0, dx = arg1. Stores 16 word lanes where
; lane i = arg0 + i * arg1 (16-bit wrap-around) to [rdi]; returns rdi in rax.
; ---------------------------------------------------------------------------
G_M9853_IG01:
;; size=0 bbWeight=1 PerfScore 0.00
G_M9853_IG02:
; Diff hunk: vpbroadcastw reads only the low 16 bits of its source, so the
; sign-extension of dx before vmovd is dropped.
- movsx rax, dx
- vmovd xmm0, eax
+ vmovd xmm0, edx
vpbroadcastw ymm0, ymm0
; Multiply the broadcast arg1 by the lane indices 0..15 held in RWD00.
vpmullw ymm0, ymm0, ymmword ptr [reloc @RWD00]
; Diff hunk: same simplification for si.
- movsx rax, si
- vmovd xmm1, eax
+ vmovd xmm1, esi
vpbroadcastw ymm1, ymm1
; Add the broadcast arg0, store the vector through the return buffer and
; return the buffer address.
vpaddw ymm0, ymm1, ymm0
vmovups ymmword ptr [rdi], ymm0
mov rax, rdi
- ;; size=45 bbWeight=1 PerfScore 17.08
+ ;; size=37 bbWeight=1 PerfScore 16.58
G_M9853_IG03:
vzeroupper
ret
;; size=4 bbWeight=1 PerfScore 2.00
; Read-only data: sixteen little-endian words 0, 1, 2, ..., 15.
RWD00 dq 0003000200010000h, 0007000600050004h, 000B000A00090008h, 000F000E000D000Ch
-; Total bytes of code 49, prolog size 0, PerfScore 19.08, instruction count 12, allocated bytes for code 49 (MethodHash=587dd982) for method System.Runtime.Intrinsics.Vector256:CreateSequence[short](short,short):System.Runtime.Intrinsics.Vector256`1[short] (FullOpts)
+; Total bytes of code 41, prolog size 0, PerfScore 18.58, instruction count 10, allocated bytes for code 41 (MethodHash=587dd982) for method System.Runtime.Intrinsics.Vector256:CreateSequence[short](short,short):System.Runtime.Intrinsics.Vector256`1[short] (FullOpts)
; ============================================================ -7 (-6.09 % of base) - System.Numerics.Vector:CreateSequence[ubyte](ubyte,ubyte):System.Numerics.Vector`1[ubyte] ; Assembly listing for method System.Numerics.Vector:CreateSequence[ubyte](ubyte,ubyte):System.Numerics.Vector`1[ubyte] (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 4 single block inlinees; 2 inlinees without PGO data
; Final local variable assignments
;
; V00 RetBuf [V00,T02] ( 4, 4 ) byref -> rdi single-def
; V01 arg0 [V01,T03] ( 3, 3 ) ubyte -> rsi single-def
; V02 arg1 [V02,T04] ( 3, 2.25) ubyte -> rdx single-def
;# V03 OutArgs [V03 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;* V04 tmp1 [V04 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
; V05 tmp2 [V05 ] ( 2, 5 ) simd32 -> [rbp-0x30] do-not-enreg[XS] addr-exposed ld-addr-op "Inline ldloca(s) first use temp" <System.Numerics.Vector`1[ubyte]>
; V06 tmp3 [V06,T00] ( 5, 16.25) int -> rax "Inline stloc first use temp"
; V07 tmp4 [V07 ] ( 2, 8.50) simd32 -> [rbp-0x50] do-not-enreg[XS] addr-exposed ld-addr-op "Inlining Arg" <System.Numerics.Vector`1[ubyte]>
; V08 tmp5 [V08 ] ( 2, 8.50) simd32 -> [rbp-0x70] do-not-enreg[XS] addr-exposed ld-addr-op "Inlining Arg" <System.Numerics.Vector`1[ubyte]>
;* V09 tmp6 [V09 ] ( 0, 0 ) int -> zero-ref "impAppendStmt"
;* V10 tmp7 [V10 ] ( 0, 0 ) ubyte -> zero-ref "Inline stloc first use temp"
;* V11 tmp8 [V11 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
;* V12 tmp9 [V12 ] ( 0, 0 ) ubyte -> zero-ref "Inlining Arg"
;* V13 tmp10 [V13 ] ( 0, 0 ) ubyte -> zero-ref "Inlining Arg"
; V14 cse0 [V14,T01] ( 4, 16 ) long -> rdx "CSE #01: aggressive"
;
; Lcl frame size = 112
; ---------------------------------------------------------------------------
; Code for System.Numerics.Vector:CreateSequence[ubyte] (x64, Unix).
; JitDiff listing: '-' lines are base codegen, '+' lines are the new codegen.
; Inputs per the local-variable table above: rdi = return buffer,
; sil = arg0, dl = arg1. Stores 32 byte lanes where
; lane i = arg0 + i * arg1 (8-bit wrap-around) to [rdi]; returns rdi in rax.
; Stack temporaries: [rbp-0x50] = lane indices, [rbp-0x70] = broadcast arg1,
; [rbp-0x30] = per-lane products.
; ---------------------------------------------------------------------------
G_M16765_IG01:
push rbp
sub rsp, 112
lea rbp, [rsp+0x70]
;; size=10 bbWeight=0.25 PerfScore 0.44
G_M16765_IG02:
; Copy the lane-index constant RWD00 to [rbp-0x50].
vmovups ymm0, ymmword ptr [reloc @RWD00]
vmovups ymmword ptr [rbp-0x50], ymm0
; Diff hunk: vpbroadcastb reads only the low 8 bits of its source, so the
; zero-extension of dl before vmovd is dropped.
- movzx rax, dl
- vmovd xmm0, eax
+ vmovd xmm0, edx
vpbroadcastb ymm0, ymm0
vmovups ymmword ptr [rbp-0x70], ymm0
; eax = loop counter for IG03, starting at 0.
xor eax, eax
align [0 bytes for IG03]
- ;; size=32 bbWeight=0.25 PerfScore 2.62
+ ;; size=29 bbWeight=0.25 PerfScore 2.56
G_M16765_IG03:
; Scalar byte multiply, one lane per iteration for eax = 0..31:
; [rbp-0x30 + i] = low byte of ([rbp-0x50 + i] * [rbp-0x70 + i]).
; rdx is reused as the sign-extended index; arg1 was already broadcast in IG02.
lea rcx, [rbp-0x50]
movsxd rdx, eax
movzx rcx, byte ptr [rcx+rdx]
lea r8, [rbp-0x70]
movzx r8, byte ptr [r8+rdx]
imul ecx, r8d
lea r8, [rbp-0x30]
mov byte ptr [r8+rdx], cl
inc eax
cmp eax, 32
jl SHORT G_M16765_IG03
;; size=39 bbWeight=4 PerfScore 41.00
G_M16765_IG04:
; Diff hunk: same simplification for sil before its byte broadcast.
- movzx rax, sil
- vmovd xmm0, eax
+ vmovd xmm0, esi
vpbroadcastb ymm0, ymm0
; Add the per-lane products to the broadcast arg0, store through the return
; buffer and return the buffer address.
vpaddb ymm0, ymm0, ymmword ptr [rbp-0x30]
vmovups ymmword ptr [rdi], ymm0
mov rax, rdi
- ;; size=25 bbWeight=1 PerfScore 8.50
+ ;; size=21 bbWeight=1 PerfScore 8.25
G_M16765_IG05:
vzeroupper
add rsp, 112
pop rbp
ret
;; size=9 bbWeight=1 PerfScore 2.75
; Read-only data: thirty-two bytes 0x00, 0x01, ..., 0x1F in memory order.
RWD00 dq 0706050403020100h, 0F0E0D0C0B0A0908h, 1716151413121110h, 1F1E1D1C1B1A1918h
-; Total bytes of code 115, prolog size 10, PerfScore 55.31, instruction count 32, allocated bytes for code 115 (MethodHash=2b2fbe82) for method System.Numerics.Vector:CreateSequence[ubyte](ubyte,ubyte):System.Numerics.Vector`1[ubyte] (FullOpts)
+; Total bytes of code 108, prolog size 10, PerfScore 55.00, instruction count 30, allocated bytes for code 108 (MethodHash=2b2fbe82) for method System.Numerics.Vector:CreateSequence[ubyte](ubyte,ubyte):System.Numerics.Vector`1[ubyte] (FullOpts)
; ============================================================ Larger list of diffs: https://gist.github.com/MihuBot/cac97d05b7a0f61201b054be0f3e61df |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Job completed in 19 minutes 21 seconds.
dotnet/runtime#108824
Diffs
Diffs
Artifacts:
The text was updated successfully, but these errors were encountered: