Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Arm64] Implement LoadPairVector64 and LoadPairVector128 #52424

Conversation

echesakov
Copy link
Contributor

@echesakov echesakov commented May 7, 2021

And use AdvSimd.Arm64.StorePair along with the newly implemented AdvSimd.Arm64.LoadPairVector128 in the following libraries methods:

System.Text.ASCIIUtility:GetIndexOfFirstNonAsciiByte_Intrinsified(long,long):long
@@ -103,9 +106,7 @@ G_M41550_IG03:
             sub     x3, x3, #32
                                                ;; bbWeight=0.50 PerfScore 8.75
 G_M41550_IG04:
-            ld1     {v17.16b}, [x0]
-            add     x4, x0, #16
-            ld1     {v18.16b}, [x4]
+            ldp     q17, q18, [x0]
             sshr    v17.16b, v17.16b, #7
             and     v17.16b, v17.16b, v16.16b
             addp    v17.16b, v17.16b, v17.16b
@@ -127,7 +128,7 @@ G_M41550_IG04:
             add     x0, x0, #32
             cmp     x0, x3
             bls     G_M41550_IG04
-                                               ;; bbWeight=4    PerfScore 102.00
+                                               ;; bbWeight=4    PerfScore 88.00
 G_M41550_IG05:
             mov     w4, w1
             tbz     w4, #4, G_M41550_IG07
@@ -245,9 +246,9 @@ G_M41550_IG19:
 RWD00          dq      1001100110011001h, 1001100110011001h
System.Collections.BitArray:CopyTo(Array,int):this
@@ -419,29 +418,22 @@ G_M19747_IG16:
             ldr     x0, [x0]
             ldr     q16, [x0,#8]
             mov     v10.d[1], v13.d[0]
-            and     v16.16b, v10.16b, v16.16b
+            and     v17.16b, v10.16b, v16.16b
             mov     v8.d[1], v12.d[0]
+            umin    v17.16b, v17.16b, v8.16b
+            mov     v9.d[1], v11.d[0]
+            zip2    v18.16b, v9.16b, v9.16b
+            and     v16.16b, v18.16b, v16.16b
             umin    v16.16b, v16.16b, v8.16b
             mov     w0, w21
             add     x0, x25, x0
-            st1     {v16.16b}, [x0]
-            mov     v9.d[1], v11.d[0]
-            zip2    v16.16b, v9.16b, v9.16b
-            movz    x1, #0xd1ffab1e
-            movk    x1, #0xd1ffab1e LSL #16
-            movk    x1, #0xd1ffab1e LSL #32
-            ldr     x1, [x1]
-            ldr     q17, [x1,#8]
-            and     v16.16b, v16.16b, v17.16b
-            umin    v16.16b, v16.16b, v8.16b
-            add     x0, x0, #16
-            st1     {v16.16b}, [x0]
+            stp     q17, q16, [x0]
             add     w21, w21, #32
             add     w0, w21, #32
             ldr     w1, [x19,#16]
             cmp     w0, w1
             bls     G_M19747_IG16
-                                               ;; bbWeight=4    PerfScore 212.00
+                                               ;; bbWeight=4    PerfScore 176.00
 G_M19747_IG17:
             mov     x0, #0
             str     x0, [fp,#16]       // [V40 loc37]
@@ -648,10 +640,10 @@ RWD00     dd      G_M19747_IG14 - G_M19747_IG02
                dd      G_M19747_IG12 - G_M19747_IG02
System.Collections.BitArray:.ctor(ref):this
-; Lcl frame size = 16
+; Lcl frame size = 8

 G_M47086_IG01:
             stp     fp, lr, [sp,#-112]!
-            stp     d8, d9, [sp,#32]
-            stp     d10, d11, [sp,#48]
-            stp     x19, x20, [sp,#64]
-            stp     x21, x22, [sp,#80]
-            stp     x23, x24, [sp,#96]
+            stp     d8, d9, [sp,#24]
+            stp     d10, d11, [sp,#40]
+            stp     d12, d13, [sp,#56]
+            stp     x19, x20, [sp,#72]
+            stp     x21, x22, [sp,#88]
+            str     x23, [sp,#104]
             mov     fp, sp
-            str     xzr, [fp,#24]      // [V05 loc3]
+            str     xzr, [fp,#16]      // [V05 loc3]
             mov     x19, x0
             mov     x20, x1
-                                               ;; bbWeight=1    PerfScore 8.50
+                                               ;; bbWeight=1    PerfScore 9.50
 G_M47086_IG02:
             cbz     x20, G_M47086_IG14
             ldr     w21, [x20,#8]
@@ -745,49 +740,50 @@ G_M47086_IG02:
                                                ;; bbWeight=1    PerfScore 14.50
 G_M47086_IG03:
             movi    v8.4s, #0x00
-            str     x20, [fp,#24]      // [V05 loc3]
-            ldr     x0, [fp,#24]       // [V05 loc3]
+            str     x20, [fp,#16]      // [V05 loc3]
+            ldr     x0, [fp,#16]       // [V05 loc3]
             ldr     w0, [x0,#8]
             cbnz    w0, G_M47086_IG04
             mov     x23, #0
             b       G_M47086_IG06
                                                ;; bbWeight=0.50 PerfScore 4.50
 G_M47086_IG04:
-            ldr     x0, [fp,#24]       // [V05 loc3]
+            ldr     x0, [fp,#16]       // [V05 loc3]
             ldr     w0, [x0,#8]
             cmp     w0, #0
             bls     G_M47086_IG15
-            ldr     x0, [fp,#24]       // [V05 loc3]
+            ldr     x0, [fp,#16]       // [V05 loc3]
             add     x23, x0, #16
             cmp     w21, #32
             blo     G_M47086_IG07
                                                ;; bbWeight=0.50 PerfScore 5.25
 G_M47086_IG05:
             mov     w0, w22
-            add     x24, x23, x0
-            ld1     {v16.16b}, [x24]
-            cmeq    v9.16b, v16.16b, v8.16b
+            add     x0, x23, x0
+            ldp     q16, q17, [x0]
+            mov     v9.16b, v17.16b
+            cmeq    v10.16b, v16.16b, v8.16b
             movz    x0, #0xd1ffab1e
             movk    x0, #0xd1ffab1e LSL #16
             movk    x0, #0xd1ffab1e LSL #32
             mov     w1, #7
-            mov     v10.d[0], v8.d[1]
-            mov     v11.d[0], v9.d[1]
+            mov     v11.d[0], v8.d[1]
+            mov     v12.d[0], v10.d[1]
+            mov     v13.d[0], v9.d[1]
             bl      CORINFO_HELP_GETSHARED_NONGCSTATIC_BASE
             movz    x0, #0xd1ffab1e
             movk    x0, #0xd1ffab1e LSL #16
             movk    x0, #0xd1ffab1e LSL #32
             ldr     x0, [x0]
             ldr     q16, [x0,#8]
-            mov     v9.d[1], v11.d[0]
-            and     v17.16b, v9.16b, v16.16b
+            mov     v10.d[1], v12.d[0]
+            and     v17.16b, v10.16b, v16.16b
             addp    v17.16b, v17.16b, v17.16b
             addp    v17.16b, v17.16b, v17.16b
             addp    v17.16b, v17.16b, v17.16b
-            add     x0, x24, #16
-            ld1     {v18.16b}, [x0]
-            mov     v8.d[1], v10.d[0]
-            cmeq    v18.16b, v18.16b, v8.16b
+            mov     v9.d[1], v13.d[0]
+            mov     v8.d[1], v11.d[0]
+            cmeq    v18.16b, v9.16b, v8.16b
             and     v16.16b, v18.16b, v16.16b
             addp    v16.16b, v16.16b, v16.16b
             addp    v16.16b, v16.16b, v16.16b
@@ -805,7 +801,7 @@ G_M47086_IG05:
             mvn     w0, w0
             str     w0, [x1, x2]
             add     w22, w22, #32
-                                               ;; bbWeight=2    PerfScore 103.00
+                                               ;; bbWeight=2    PerfScore 101.00
 G_M47086_IG06:
             add     w0, w22, #32
             cmp     w21, w0
@@ -813,7 +809,7 @@ G_M47086_IG06:
                                                ;; bbWeight=4    PerfScore 8.00
 G_M47086_IG07:
             mov     x0, #0
-            str     x0, [fp,#24]       // [V05 loc3]
+            str     x0, [fp,#16]       // [V05 loc3]
             cmp     w21, w22
             bls     G_M47086_IG11
                                                ;; bbWeight=0.50 PerfScore 1.50
@@ -851,14 +847,15 @@ G_M47086_IG11:
             str     wzr, [x19,#20]
                                                ;; bbWeight=1    PerfScore 1.00
 G_M47086_IG12:
-            ldp     x23, x24, [sp,#96]
-            ldp     x21, x22, [sp,#80]
-            ldp     x19, x20, [sp,#64]
-            ldp     d10, d11, [sp,#48]
-            ldp     d8, d9, [sp,#32]
+            ldr     x23, [sp,#104]
+            ldp     x21, x22, [sp,#88]
+            ldp     x19, x20, [sp,#72]
+            ldp     d12, d13, [sp,#56]
+            ldp     d10, d11, [sp,#40]
+            ldp     d8, d9, [sp,#24]
             ldp     fp, lr, [sp],#112
             ret     lr
-                                               ;; bbWeight=1    PerfScore 7.00
+                                               ;; bbWeight=1    PerfScore 9.00
System.Text.Encodings.Web.OptimizedInboxTextEncoder:GetIndexOfFirstCharToEncodeAdvSimd64(long,long):long:this
@@ -69,10 +73,8 @@ G_M26779_IG03:
 G_M26779_IG04:
             lsl     x6, x4, #1
             add     x6, x1, x6
-            ld1     {v20.8h}, [x6]
+            ldp     q20, q21, [x6]
             sqxtun  v20.8b, v20.8h
-            add     x6, x6, #16
-            ld1     {v21.8h}, [x6]
             sqxtun2 v20.16b, v21.8h
             and     v21.16b, v20.16b, v16.16b
             tbl     v21.16b, {v19.16b}, v21.16b
@@ -87,7 +89,7 @@ G_M26779_IG04:
             add     x4, x4, #16
             cmp     x4, x5
             blo     G_M26779_IG04
-                                               ;; bbWeight=4    PerfScore 102.00
+                                               ;; bbWeight=4    PerfScore 88.00
 G_M26779_IG05:
             mov     w5, w2
             tbz     w5, #3, G_M26779_IG06
@@ -179,8 +181,8 @@ RWD00       dq      8040201008040201h, 0000000000000000h
 RWD16          dq      F00FF00FF00FF00Fh, F00FF00FF00FF00Fh

Fixes #39243

@dotnet-issue-labeler
Copy link

Note regarding the new-api-needs-documentation label:

This serves as a reminder for when your PR is modifying a ref *.cs file and adding/modifying public APIs, to please make sure the API implementation in the src *.cs file is documented with triple slash comments, so the PR reviewers can sign off that change.

@ghost
Copy link

ghost commented May 7, 2021

Tagging subscribers to this area: @tannergooding
See info in area-owners.md if you want to be subscribed.

Issue Details

And use AdvSimd.Arm64.StorePair along with the newly implemented AdvSimd.Arm64.LoadPairVector128 in the following libraries methods:

System.Text.ASCIIUtility:GetIndexOfFirstNonAsciiByte_Intrinsified(long,long):long
@@ -103,9 +106,7 @@ G_M41550_IG03:
             sub     x3, x3, #32
                                                ;; bbWeight=0.50 PerfScore 8.75
 G_M41550_IG04:
-            ld1     {v17.16b}, [x0]
-            add     x4, x0, #16
-            ld1     {v18.16b}, [x4]
+            ldp     q17, q18, [x0]
             sshr    v17.16b, v17.16b, #7
             and     v17.16b, v17.16b, v16.16b
             addp    v17.16b, v17.16b, v17.16b
@@ -127,7 +128,7 @@ G_M41550_IG04:
             add     x0, x0, #32
             cmp     x0, x3
             bls     G_M41550_IG04
-                                               ;; bbWeight=4    PerfScore 102.00
+                                               ;; bbWeight=4    PerfScore 88.00
 G_M41550_IG05:
             mov     w4, w1
             tbz     w4, #4, G_M41550_IG07
@@ -245,9 +246,9 @@ G_M41550_IG19:
 RWD00          dq      1001100110011001h, 1001100110011001h
System.Collections.BitArray:CopyTo(Array,int):this
@@ -419,29 +418,22 @@ G_M19747_IG16:
             ldr     x0, [x0]
             ldr     q16, [x0,#8]
             mov     v10.d[1], v13.d[0]
-            and     v16.16b, v10.16b, v16.16b
+            and     v17.16b, v10.16b, v16.16b
             mov     v8.d[1], v12.d[0]
+            umin    v17.16b, v17.16b, v8.16b
+            mov     v9.d[1], v11.d[0]
+            zip2    v18.16b, v9.16b, v9.16b
+            and     v16.16b, v18.16b, v16.16b
             umin    v16.16b, v16.16b, v8.16b
             mov     w0, w21
             add     x0, x25, x0
-            st1     {v16.16b}, [x0]
-            mov     v9.d[1], v11.d[0]
-            zip2    v16.16b, v9.16b, v9.16b
-            movz    x1, #0xd1ffab1e
-            movk    x1, #0xd1ffab1e LSL #16
-            movk    x1, #0xd1ffab1e LSL #32
-            ldr     x1, [x1]
-            ldr     q17, [x1,#8]
-            and     v16.16b, v16.16b, v17.16b
-            umin    v16.16b, v16.16b, v8.16b
-            add     x0, x0, #16
-            st1     {v16.16b}, [x0]
+            stp     q17, q16, [x0]
             add     w21, w21, #32
             add     w0, w21, #32
             ldr     w1, [x19,#16]
             cmp     w0, w1
             bls     G_M19747_IG16
-                                               ;; bbWeight=4    PerfScore 212.00
+                                               ;; bbWeight=4    PerfScore 176.00
 G_M19747_IG17:
             mov     x0, #0
             str     x0, [fp,#16]       // [V40 loc37]
@@ -648,10 +640,10 @@ RWD00     dd      G_M19747_IG14 - G_M19747_IG02
                dd      G_M19747_IG12 - G_M19747_IG02
System.Collections.BitArray:.ctor(ref):this
-; Lcl frame size = 16
+; Lcl frame size = 8

 G_M47086_IG01:
             stp     fp, lr, [sp,#-112]!
-            stp     d8, d9, [sp,#32]
-            stp     d10, d11, [sp,#48]
-            stp     x19, x20, [sp,#64]
-            stp     x21, x22, [sp,#80]
-            stp     x23, x24, [sp,#96]
+            stp     d8, d9, [sp,#24]
+            stp     d10, d11, [sp,#40]
+            stp     d12, d13, [sp,#56]
+            stp     x19, x20, [sp,#72]
+            stp     x21, x22, [sp,#88]
+            str     x23, [sp,#104]
             mov     fp, sp
-            str     xzr, [fp,#24]      // [V05 loc3]
+            str     xzr, [fp,#16]      // [V05 loc3]
             mov     x19, x0
             mov     x20, x1
-                                               ;; bbWeight=1    PerfScore 8.50
+                                               ;; bbWeight=1    PerfScore 9.50
 G_M47086_IG02:
             cbz     x20, G_M47086_IG14
             ldr     w21, [x20,#8]
@@ -745,49 +740,50 @@ G_M47086_IG02:
                                                ;; bbWeight=1    PerfScore 14.50
 G_M47086_IG03:
             movi    v8.4s, #0x00
-            str     x20, [fp,#24]      // [V05 loc3]
-            ldr     x0, [fp,#24]       // [V05 loc3]
+            str     x20, [fp,#16]      // [V05 loc3]
+            ldr     x0, [fp,#16]       // [V05 loc3]
             ldr     w0, [x0,#8]
             cbnz    w0, G_M47086_IG04
             mov     x23, #0
             b       G_M47086_IG06
                                                ;; bbWeight=0.50 PerfScore 4.50
 G_M47086_IG04:
-            ldr     x0, [fp,#24]       // [V05 loc3]
+            ldr     x0, [fp,#16]       // [V05 loc3]
             ldr     w0, [x0,#8]
             cmp     w0, #0
             bls     G_M47086_IG15
-            ldr     x0, [fp,#24]       // [V05 loc3]
+            ldr     x0, [fp,#16]       // [V05 loc3]
             add     x23, x0, #16
             cmp     w21, #32
             blo     G_M47086_IG07
                                                ;; bbWeight=0.50 PerfScore 5.25
 G_M47086_IG05:
             mov     w0, w22
-            add     x24, x23, x0
-            ld1     {v16.16b}, [x24]
-            cmeq    v9.16b, v16.16b, v8.16b
+            add     x0, x23, x0
+            ldp     q16, q17, [x0]
+            mov     v9.16b, v17.16b
+            cmeq    v10.16b, v16.16b, v8.16b
             movz    x0, #0xd1ffab1e
             movk    x0, #0xd1ffab1e LSL #16
             movk    x0, #0xd1ffab1e LSL #32
             mov     w1, #7
-            mov     v10.d[0], v8.d[1]
-            mov     v11.d[0], v9.d[1]
+            mov     v11.d[0], v8.d[1]
+            mov     v12.d[0], v10.d[1]
+            mov     v13.d[0], v9.d[1]
             bl      CORINFO_HELP_GETSHARED_NONGCSTATIC_BASE
             movz    x0, #0xd1ffab1e
             movk    x0, #0xd1ffab1e LSL #16
             movk    x0, #0xd1ffab1e LSL #32
             ldr     x0, [x0]
             ldr     q16, [x0,#8]
-            mov     v9.d[1], v11.d[0]
-            and     v17.16b, v9.16b, v16.16b
+            mov     v10.d[1], v12.d[0]
+            and     v17.16b, v10.16b, v16.16b
             addp    v17.16b, v17.16b, v17.16b
             addp    v17.16b, v17.16b, v17.16b
             addp    v17.16b, v17.16b, v17.16b
-            add     x0, x24, #16
-            ld1     {v18.16b}, [x0]
-            mov     v8.d[1], v10.d[0]
-            cmeq    v18.16b, v18.16b, v8.16b
+            mov     v9.d[1], v13.d[0]
+            mov     v8.d[1], v11.d[0]
+            cmeq    v18.16b, v9.16b, v8.16b
             and     v16.16b, v18.16b, v16.16b
             addp    v16.16b, v16.16b, v16.16b
             addp    v16.16b, v16.16b, v16.16b
@@ -805,7 +801,7 @@ G_M47086_IG05:
             mvn     w0, w0
             str     w0, [x1, x2]
             add     w22, w22, #32
-                                               ;; bbWeight=2    PerfScore 103.00
+                                               ;; bbWeight=2    PerfScore 101.00
 G_M47086_IG06:
             add     w0, w22, #32
             cmp     w21, w0
@@ -813,7 +809,7 @@ G_M47086_IG06:
                                                ;; bbWeight=4    PerfScore 8.00
 G_M47086_IG07:
             mov     x0, #0
-            str     x0, [fp,#24]       // [V05 loc3]
+            str     x0, [fp,#16]       // [V05 loc3]
             cmp     w21, w22
             bls     G_M47086_IG11
                                                ;; bbWeight=0.50 PerfScore 1.50
@@ -851,14 +847,15 @@ G_M47086_IG11:
             str     wzr, [x19,#20]
                                                ;; bbWeight=1    PerfScore 1.00
 G_M47086_IG12:
-            ldp     x23, x24, [sp,#96]
-            ldp     x21, x22, [sp,#80]
-            ldp     x19, x20, [sp,#64]
-            ldp     d10, d11, [sp,#48]
-            ldp     d8, d9, [sp,#32]
+            ldr     x23, [sp,#104]
+            ldp     x21, x22, [sp,#88]
+            ldp     x19, x20, [sp,#72]
+            ldp     d12, d13, [sp,#56]
+            ldp     d10, d11, [sp,#40]
+            ldp     d8, d9, [sp,#24]
             ldp     fp, lr, [sp],#112
             ret     lr
-                                               ;; bbWeight=1    PerfScore 7.00
+                                               ;; bbWeight=1    PerfScore 9.00
System.Text.Encodings.Web.OptimizedInboxTextEncoder:GetIndexOfFirstCharToEncodeAdvSimd64(long,long):long:this
@@ -69,10 +73,8 @@ G_M26779_IG03:
 G_M26779_IG04:
             lsl     x6, x4, #1
             add     x6, x1, x6
-            ld1     {v20.8h}, [x6]
+            ldp     q20, q21, [x6]
             sqxtun  v20.8b, v20.8h
-            add     x6, x6, #16
-            ld1     {v21.8h}, [x6]
             sqxtun2 v20.16b, v21.8h
             and     v21.16b, v20.16b, v16.16b
             tbl     v21.16b, {v19.16b}, v21.16b
@@ -87,7 +89,7 @@ G_M26779_IG04:
             add     x4, x4, #16
             cmp     x4, x5
             blo     G_M26779_IG04
-                                               ;; bbWeight=4    PerfScore 102.00
+                                               ;; bbWeight=4    PerfScore 88.00
 G_M26779_IG05:
             mov     w5, w2
             tbz     w5, #3, G_M26779_IG06
@@ -179,8 +181,8 @@ RWD00       dq      8040201008040201h, 0000000000000000h
 RWD16          dq      F00FF00FF00FF00Fh, F00FF00FF00FF00Fh
Author: echesakovMSFT
Assignees: -
Labels:

area-System.Runtime.Intrinsics, new-api-needs-documentation

Milestone: -

@echesakov echesakov added the area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI label May 7, 2021
@echesakov echesakov self-assigned this May 7, 2021
@echesakov echesakov added this to the 6.0.0 milestone May 7, 2021
@am11
Copy link
Member

am11 commented May 7, 2021

System.Collections.BitArray:.ctor(ref):this

seems to have a slight regression. Is keeping that method to continue using LoadVector128 (no pair) better?

@echesakov
Copy link
Contributor Author

System.Collections.BitArray:.ctor(ref):this

seems to have a slight regression. Is keeping that method to continue using LoadVector128 (no pair) better?

Agreed, the regression is due to register movements around the call. I will update the changes after fixing the assertion failure that CI testing revealed.

@ghost
Copy link

ghost commented Jun 9, 2021

Draft Pull Request was automatically closed for inactivity. Please let us know if you'd like to reopen it.

@echesakov echesakov force-pushed the Arm64-ASIMD-LoadPairVector64-LoadPairVector128 branch from b78e415 to fab6ef3 Compare June 25, 2021 03:28
@echesakov echesakov marked this pull request as ready for review June 25, 2021 03:31
@echesakov
Copy link
Contributor Author

@imhameed @fanyang-mono @vargaz @naricc Do you have any guidance on how to add support for Arm64 intrinsics returning (Vector64<T>, Vector64<T>)/(Vector128<T>, Vector128<T>) in Mono? Is it even feasible to complete in .NET 6?

@echesakov
Copy link
Contributor Author

It looks like LLVM already folds two consecutive SIMD loads into one SIMD ldp instruction, so if Mono codegen emits IR as in https://godbolt.org/z/7b66oar6M, the output for LoadPairVector64/LoadPairVector128 would be the same as the one generated by the JIT.

@imhameed
Copy link
Contributor

Several options for generating an ldp: https://godbolt.org/z/svjEqavPj

The annoying part would be fabricating a ValueTuple (needs an appropriately-sized alloca along with two stores, I guess), which I don't think we have any special handling for, and which I don't know how to do offhand. I'm in the middle of building an arm64 Linux copy of Mono to see what we currently do although this is taking me longer than I'd like because Parallels is mysteriously making my VM drop to 1GB of total memory from 10GB after running for an hour or so... maybe I'll try UTM

@echesakov echesakov marked this pull request as draft June 25, 2021 21:34
@imhameed
Copy link
Contributor

@echesakovMSFT here's an implementation of LoadPairVector for Mono that passes the tests you've added: imhameed@ec909cb

The generated code is sometimes good and sometimes bad; we'll need to improve our calling convention and possibly improve our treatment of value types for more reliably good output. LLVM doesn't seem to have any support for lowering !nontemporal loads to ldnp in 11.0.1 or in main so there's not much we can do about that without emitting a small inline assembly blob.

@echesakov
Copy link
Contributor Author

@imhameed Thank you for the follow-up and for implementing support on the Mono side!

The generated code is sometimes good and sometimes bad; we'll need to improve our calling convention and possibly improve our treatment of value types for more reliably good output. LLVM doesn't seem to have any support for lowering !nontemporal loads to ldnp in 11.0.1 or in main so there's not much we can do about that without emitting a small inline assembly blob.

Do you think we can take your changes to .NET 6.0? If so, can you please push the commit to this PR? I found some issues with CoreCLR implementation of multi-reg nodes. I hope I will resolve them by .NET 6 deadline and, in that case, we can merge both changes to CoreCLR and Mono as one PR.

@imhameed
Copy link
Contributor

Yeah I think the Mono changes should be fine for .NET 6. I'll push it to your branch, and merging it as part of this PR sounds perfect to me.

Fabricates a `ValueTuple<T, T>` for the result in a local alloca.
@echesakov
Copy link
Contributor Author

I don't think this will make it into .NET 6. I found more issues with the multi-register node implementation, so I am moving this PR and the corresponding issue to Future.

@echesakov echesakov modified the milestones: 6.0.0, Future Jul 8, 2021
@echesakov
Copy link
Contributor Author

Will re-open after I fix the issues.

@echesakov echesakov closed this Jul 8, 2021
@ghost ghost locked as resolved and limited conversation to collaborators Aug 7, 2021
@kunalspathak
Copy link
Member

Tagging myself

Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
arch-arm64 area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI
Projects
None yet
Development

Successfully merging this pull request may close these issues.

[Arm64] LoadPairVector64 and LoadPairVector128
4 participants