-
Notifications
You must be signed in to change notification settings - Fork 4.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Arm64] Implement LoadPairVector64 and LoadPairVector128 #52424
[Arm64] Implement LoadPairVector64 and LoadPairVector128 #52424
Conversation
Note: This serves as a reminder that when your PR modifies a ref *.cs file and adds or modifies public APIs, please make sure the API implementation in the src *.cs file is documented with triple-slash comments, so the PR reviewers can sign off on that change. |
Tagging subscribers to this area: @tannergooding Issue DetailsAnd use System.Text.ASCIIUtility:GetIndexOfFirstNonAsciiByte_Intrinsified(long,long):long@@ -103,9 +106,7 @@ G_M41550_IG03:
sub x3, x3, #32
;; bbWeight=0.50 PerfScore 8.75
G_M41550_IG04:
- ld1 {v17.16b}, [x0]
- add x4, x0, #16
- ld1 {v18.16b}, [x4]
+ ldp q17, q18, [x0]
sshr v17.16b, v17.16b, #7
and v17.16b, v17.16b, v16.16b
addp v17.16b, v17.16b, v17.16b
@@ -127,7 +128,7 @@ G_M41550_IG04:
add x0, x0, #32
cmp x0, x3
bls G_M41550_IG04
- ;; bbWeight=4 PerfScore 102.00
+ ;; bbWeight=4 PerfScore 88.00
G_M41550_IG05:
mov w4, w1
tbz w4, #4, G_M41550_IG07
@@ -245,9 +246,9 @@ G_M41550_IG19:
RWD00 dq 1001100110011001h, 1001100110011001h System.Collections.BitArray:CopyTo(Array,int):this@@ -419,29 +418,22 @@ G_M19747_IG16:
ldr x0, [x0]
ldr q16, [x0,#8]
mov v10.d[1], v13.d[0]
- and v16.16b, v10.16b, v16.16b
+ and v17.16b, v10.16b, v16.16b
mov v8.d[1], v12.d[0]
+ umin v17.16b, v17.16b, v8.16b
+ mov v9.d[1], v11.d[0]
+ zip2 v18.16b, v9.16b, v9.16b
+ and v16.16b, v18.16b, v16.16b
umin v16.16b, v16.16b, v8.16b
mov w0, w21
add x0, x25, x0
- st1 {v16.16b}, [x0]
- mov v9.d[1], v11.d[0]
- zip2 v16.16b, v9.16b, v9.16b
- movz x1, #0xd1ffab1e
- movk x1, #0xd1ffab1e LSL #16
- movk x1, #0xd1ffab1e LSL #32
- ldr x1, [x1]
- ldr q17, [x1,#8]
- and v16.16b, v16.16b, v17.16b
- umin v16.16b, v16.16b, v8.16b
- add x0, x0, #16
- st1 {v16.16b}, [x0]
+ stp q17, q16, [x0]
add w21, w21, #32
add w0, w21, #32
ldr w1, [x19,#16]
cmp w0, w1
bls G_M19747_IG16
- ;; bbWeight=4 PerfScore 212.00
+ ;; bbWeight=4 PerfScore 176.00
G_M19747_IG17:
mov x0, #0
str x0, [fp,#16] // [V40 loc37]
@@ -648,10 +640,10 @@ RWD00 dd G_M19747_IG14 - G_M19747_IG02
dd G_M19747_IG12 - G_M19747_IG02 System.Collections.BitArray:.ctor(ref):this-; Lcl frame size = 16
+; Lcl frame size = 8
G_M47086_IG01:
stp fp, lr, [sp,#-112]!
- stp d8, d9, [sp,#32]
- stp d10, d11, [sp,#48]
- stp x19, x20, [sp,#64]
- stp x21, x22, [sp,#80]
- stp x23, x24, [sp,#96]
+ stp d8, d9, [sp,#24]
+ stp d10, d11, [sp,#40]
+ stp d12, d13, [sp,#56]
+ stp x19, x20, [sp,#72]
+ stp x21, x22, [sp,#88]
+ str x23, [sp,#104]
mov fp, sp
- str xzr, [fp,#24] // [V05 loc3]
+ str xzr, [fp,#16] // [V05 loc3]
mov x19, x0
mov x20, x1
- ;; bbWeight=1 PerfScore 8.50
+ ;; bbWeight=1 PerfScore 9.50
G_M47086_IG02:
cbz x20, G_M47086_IG14
ldr w21, [x20,#8]
@@ -745,49 +740,50 @@ G_M47086_IG02:
;; bbWeight=1 PerfScore 14.50
G_M47086_IG03:
movi v8.4s, #0x00
- str x20, [fp,#24] // [V05 loc3]
- ldr x0, [fp,#24] // [V05 loc3]
+ str x20, [fp,#16] // [V05 loc3]
+ ldr x0, [fp,#16] // [V05 loc3]
ldr w0, [x0,#8]
cbnz w0, G_M47086_IG04
mov x23, #0
b G_M47086_IG06
;; bbWeight=0.50 PerfScore 4.50
G_M47086_IG04:
- ldr x0, [fp,#24] // [V05 loc3]
+ ldr x0, [fp,#16] // [V05 loc3]
ldr w0, [x0,#8]
cmp w0, #0
bls G_M47086_IG15
- ldr x0, [fp,#24] // [V05 loc3]
+ ldr x0, [fp,#16] // [V05 loc3]
add x23, x0, #16
cmp w21, #32
blo G_M47086_IG07
;; bbWeight=0.50 PerfScore 5.25
G_M47086_IG05:
mov w0, w22
- add x24, x23, x0
- ld1 {v16.16b}, [x24]
- cmeq v9.16b, v16.16b, v8.16b
+ add x0, x23, x0
+ ldp q16, q17, [x0]
+ mov v9.16b, v17.16b
+ cmeq v10.16b, v16.16b, v8.16b
movz x0, #0xd1ffab1e
movk x0, #0xd1ffab1e LSL #16
movk x0, #0xd1ffab1e LSL #32
mov w1, #7
- mov v10.d[0], v8.d[1]
- mov v11.d[0], v9.d[1]
+ mov v11.d[0], v8.d[1]
+ mov v12.d[0], v10.d[1]
+ mov v13.d[0], v9.d[1]
bl CORINFO_HELP_GETSHARED_NONGCSTATIC_BASE
movz x0, #0xd1ffab1e
movk x0, #0xd1ffab1e LSL #16
movk x0, #0xd1ffab1e LSL #32
ldr x0, [x0]
ldr q16, [x0,#8]
- mov v9.d[1], v11.d[0]
- and v17.16b, v9.16b, v16.16b
+ mov v10.d[1], v12.d[0]
+ and v17.16b, v10.16b, v16.16b
addp v17.16b, v17.16b, v17.16b
addp v17.16b, v17.16b, v17.16b
addp v17.16b, v17.16b, v17.16b
- add x0, x24, #16
- ld1 {v18.16b}, [x0]
- mov v8.d[1], v10.d[0]
- cmeq v18.16b, v18.16b, v8.16b
+ mov v9.d[1], v13.d[0]
+ mov v8.d[1], v11.d[0]
+ cmeq v18.16b, v9.16b, v8.16b
and v16.16b, v18.16b, v16.16b
addp v16.16b, v16.16b, v16.16b
addp v16.16b, v16.16b, v16.16b
@@ -805,7 +801,7 @@ G_M47086_IG05:
mvn w0, w0
str w0, [x1, x2]
add w22, w22, #32
- ;; bbWeight=2 PerfScore 103.00
+ ;; bbWeight=2 PerfScore 101.00
G_M47086_IG06:
add w0, w22, #32
cmp w21, w0
@@ -813,7 +809,7 @@ G_M47086_IG06:
;; bbWeight=4 PerfScore 8.00
G_M47086_IG07:
mov x0, #0
- str x0, [fp,#24] // [V05 loc3]
+ str x0, [fp,#16] // [V05 loc3]
cmp w21, w22
bls G_M47086_IG11
;; bbWeight=0.50 PerfScore 1.50
@@ -851,14 +847,15 @@ G_M47086_IG11:
str wzr, [x19,#20]
;; bbWeight=1 PerfScore 1.00
G_M47086_IG12:
- ldp x23, x24, [sp,#96]
- ldp x21, x22, [sp,#80]
- ldp x19, x20, [sp,#64]
- ldp d10, d11, [sp,#48]
- ldp d8, d9, [sp,#32]
+ ldr x23, [sp,#104]
+ ldp x21, x22, [sp,#88]
+ ldp x19, x20, [sp,#72]
+ ldp d12, d13, [sp,#56]
+ ldp d10, d11, [sp,#40]
+ ldp d8, d9, [sp,#24]
ldp fp, lr, [sp],#112
ret lr
- ;; bbWeight=1 PerfScore 7.00
+ ;; bbWeight=1 PerfScore 9.00 System.Text.Encodings.Web.OptimizedInboxTextEncoder:GetIndexOfFirstCharToEncodeAdvSimd64(long,long):long:this@@ -69,10 +73,8 @@ G_M26779_IG03:
G_M26779_IG04:
lsl x6, x4, #1
add x6, x1, x6
- ld1 {v20.8h}, [x6]
+ ldp q20, q21, [x6]
sqxtun v20.8b, v20.8h
- add x6, x6, #16
- ld1 {v21.8h}, [x6]
sqxtun2 v20.16b, v21.8h
and v21.16b, v20.16b, v16.16b
tbl v21.16b, {v19.16b}, v21.16b
@@ -87,7 +89,7 @@ G_M26779_IG04:
add x4, x4, #16
cmp x4, x5
blo G_M26779_IG04
- ;; bbWeight=4 PerfScore 102.00
+ ;; bbWeight=4 PerfScore 88.00
G_M26779_IG05:
mov w5, w2
tbz w5, #3, G_M26779_IG06
@@ -179,8 +181,8 @@ RWD00 dq 8040201008040201h, 0000000000000000h
RWD16 dq F00FF00FF00FF00Fh, F00FF00FF00FF00Fh
|
seems to have a slight regression. Is keeping that method to continue using |
Agree, the regression is due to register movements around the call. I will update the changes after fixing the assertion that the CI testing has revealed. |
Draft Pull Request was automatically closed for inactivity. Please let us know if you'd like to reopen it. |
…formNotSupported.cs
…dvSimd.cs AdvSimd.PlatformNotSupported.cs
…elpers.cs src/tests/JIT/HardwareIntrinsics/Arm/Shared/Helpers.tt
…alues in multiple registers in lsra.h lsraarm64.cpp lsraxarch.cpp
…ccodegenarm64.cpp
…ic and always use BlkOpKindUnroll for such cases in lowerarmarch.cpp
…odegenarmarch.cpp
b78e415
to
fab6ef3
Compare
@imhameed @fanyang-mono @vargaz @naricc Do you have any guidance on how to add support for Arm64 intrinsics returning |
It looks like LLVM already folds two consecutive SIMD loads into one SIMD |
Several options for generating an ldp: https://godbolt.org/z/svjEqavPj The annoying part would be fabricating a |
@echesakovMSFT here's an implementation of LoadPairVector for Mono that passes the tests you've added: imhameed@ec909cb The generated code is sometimes good and sometimes bad; we'll need to improve our calling convention and possibly improve our treatment of value types for more reliably good output. LLVM doesn't seem to have any support for lowering |
@imhameed Thank you for the follow-up and for implementing support on the Mono side!
Do you think we can take your changes to .NET 6.0? If so, can you please push the commit to this PR? I found some issues with the CoreCLR implementation of multi-reg nodes. I hope I will resolve them by the .NET 6 deadline and, in that case, we can merge both changes to CoreCLR and Mono as one PR. |
Yeah I think the Mono changes should be fine for .NET 6. I'll push it to your branch, and merging it as part of this PR sounds perfect to me. |
Fabricates a `ValueTuple<T, T>` for the result in a local alloca.
I don't think this will make it into .NET 6. I found more issues with the multi-register nodes implementation and am moving this PR and the corresponding issue to Future. |
Will re-open after I fix the issues. |
Tagging myself |
And use
AdvSimd.Arm64.StorePair
along with the newly implemented AdvSimd.Arm64.LoadPairVector128
in the following libraries methods: System.Text.ASCIIUtility:GetIndexOfFirstNonAsciiByte_Intrinsified(long,long):long
System.Collections.BitArray:CopyTo(Array,int):this
System.Collections.BitArray:.ctor(ref):this
System.Text.Encodings.Web.OptimizedInboxTextEncoder:GetIndexOfFirstCharToEncodeAdvSimd64(long,long):long:this
Fixes #39243