JIT ARM64-SVE: Add BP_1A to BQ_2B, BU_2A #99245

amanasifkhalid · 2024-03-04T16:23:35Z

Part of #94549. Adds the following encodings:

SVE_BP_1A
SVE_BO_1A
SVE_BQ_2A
SVE_BQ_2B
SVE_BU_2A

cstool output:

sqdecb        x0, w0, pow2
sqdecd        x1, w1, vl1, mul #2
sqdech        x2, w2, vl2, mul #3
sqdecw        x3, w3, vl3, mul #4
sqincb        x4, w4, vl4, mul #5
sqincd        x5, w5, vl5, mul #6
sqinch        x6, w6, vl6, mul #7
sqincw        x7, w7, vl7, mul #8
uqdecb        w8, vl8, mul #9
uqdecd        w9, vl16, mul #10
uqdech        w10, vl32, mul #11
uqdecw        w11, vl64, mul #12
uqincb        w12, vl128, mul #13
uqincd        w13, vl256, mul #14
uqinch        w14, mul4, mul #15
uqincw        w15, all, mul #16
sqdecd        z0.d, vl1
sqdech        z1.h, vl2, mul #2
sqdecw        z2.s, vl3, mul #3
sqincd        z3.d, vl4, mul #4
sqinch        z4.h, vl5, mul #5
sqincw        z5.s, vl6, mul #6
uqdecd        z6.d, vl7, mul #7
uqdech        z7.h, vl8, mul #8
uqdecw        z8.s, vl16, mul #9
uqincd        z9.d, vl32, mul #10
uqinch        z10.h, pow2, mul #11
uqincw        z11.s, all, mul #16
ext   z0.b, { z1.b, z2.b }, #0
ext   z2.b, { z3.b, z4.b }, #5
ext   z4.b, { z5.b, z6.b }, #0x80
ext   z6.b, { z31.b, z0.b }, #0xFF
ext   z0.b, z0.b, z1.b, #0
ext   z2.b, z2.b, z3.b, #0x1F
ext   z4.b, z4.b, z5.b, #0x40
ext   z6.b, z6.b, z7.b, #0xFF
fmov  z0.h, p1/m, #2.00000000
fmov  z2.s, p3/m, #1.00000000
fmov  z4.d, p5/m, #-10.00000000
fmov  z6.h, p7/m, #-0.12500000
fmov  z8.s, p9/m, #31.00000000
fmov  z10.d, p11/m, #0.50000000

JitDisasm output:

sqdecb  x0, w0, pow2
sqdecd  x1, w1, vl1, mul #2
sqdech  x2, w2, vl2, mul #3
sqdecw  x3, w3, vl3, mul #4
sqincb  x4, w4, vl4, mul #5
sqincd  x5, w5, vl5, mul #6
sqinch  x6, w6, vl6, mul #7
sqincw  x7, w7, vl7, mul #8
uqdecb  w8, vl8, mul #9
uqdecd  w9, vl16, mul #10
uqdech  w10, vl32, mul #11
uqdecw  w11, vl64, mul #12
uqincb  w12, vl128, mul #13
uqincd  w13, vl256, mul #14
uqinch  w14, mul4, mul #15
uqincw  w15, all, mul #16
sqdecd  z0.d, vl1
sqdech  z1.h, vl2, mul #2
sqdecw  z2.s, vl3, mul #3
sqincd  z3.d, vl4, mul #4
sqinch  z4.h, vl5, mul #5
sqincw  z5.s, vl6, mul #6
uqdecd  z6.d, vl7, mul #7
uqdech  z7.h, vl8, mul #8
uqdecw  z8.s, vl16, mul #9
uqincd  z9.d, vl32, mul #10
uqinch  z10.h, pow2, mul #11
uqincw  z11.s, all, mul #16
ext     z0.b, {v1.b, v2.b}, #0
ext     z2.b, {v3.b, v4.b}, #5
ext     z4.b, {v5.b, v6.b}, #128
ext     z6.b, {v31.b, v0.b}, #255
ext     z0.b, z0.b, z1.b, #0
ext     z2.b, z2.b, z3.b, #31
ext     z4.b, z4.b, z5.b, #64
ext     z6.b, z6.b, z7.b, #255
fmov    z0.h, p1/m, #2.0000
fmov    z2.s, p3/m, #1.0000
fmov    z4.d, p5/m, #-10.0000
fmov    z6.h, p7/m, #-0.1250
fmov    z8.s, p9/m, #31.0000
fmov    z10.d, p11/m, #0.5000

cc @dotnet/arm64-contrib.

ghost · 2024-03-04T16:23:44Z

Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch
See info in area-owners.md if you want to be subscribed.

Issue Details

Part of #94549. Adds the following encodings:

SVE_BP_1A
SVE_BO_1A
SVE_BQ_2A
SVE_BQ_2B
SVE_BU_2A

cstool output:

sqdecb        x0, w0, pow2
sqdecd        x1, w1, vl1, mul #2
sqdech        x2, w2, vl2, mul #3
sqdecw        x3, w3, vl3, mul #4
sqincb        x4, w4, vl4, mul #5
sqincd        x5, w5, vl5, mul #6
sqinch        x6, w6, vl6, mul #7
sqincw        x7, w7, vl7, mul #8
uqdecb        w8, vl8, mul #9
uqdecd        w9, vl16, mul #10
uqdech        w10, vl32, mul #11
uqdecw        w11, vl64, mul #12
uqincb        w12, vl128, mul #13
uqincd        w13, vl256, mul #14
uqinch        w14, mul4, mul #15
uqincw        w15, all, mul #16
sqdecd        z0.d, vl1
sqdech        z1.h, vl2, mul #2
sqdecw        z2.s, vl3, mul #3
sqincd        z3.d, vl4, mul #4
sqinch        z4.h, vl5, mul #5
sqincw        z5.s, vl6, mul #6
uqdecd        z6.d, vl7, mul #7
uqdech        z7.h, vl8, mul #8
uqdecw        z8.s, vl16, mul #9
uqincd        z9.d, vl32, mul #10
uqinch        z10.h, pow2, mul #11
uqincw        z11.s, all, mul #16
ext   z0.b, { z1.b, z2.b }, #0
ext   z2.b, { z3.b, z4.b }, #5
ext   z4.b, { z5.b, z6.b }, #0x80
ext   z6.b, { z31.b, z0.b }, #0xFF
ext   z0.b, z0.b, z1.b, #0
ext   z2.b, z2.b, z3.b, #0x1F
ext   z4.b, z4.b, z5.b, #0x40
ext   z6.b, z6.b, z7.b, #0xFF
fmov  z0.h, p1/m, #2.00000000
fmov  z2.s, p3/m, #1.00000000
fmov  z4.d, p5/m, #-10.00000000
fmov  z6.h, p7/m, #-0.12500000
fmov  z8.s, p9/m, #31.00000000
fmov  z10.d, p11/m, #0.50000000

JitDisasm output:

sqdecb  x0, w0, pow2
sqdecd  x1, w1, vl1, mul #2
sqdech  x2, w2, vl2, mul #3
sqdecw  x3, w3, vl3, mul #4
sqincb  x4, w4, vl4, mul #5
sqincd  x5, w5, vl5, mul #6
sqinch  x6, w6, vl6, mul #7
sqincw  x7, w7, vl7, mul #8
uqdecb  w8, vl8, mul #9
uqdecd  w9, vl16, mul #10
uqdech  w10, vl32, mul #11
uqdecw  w11, vl64, mul #12
uqincb  w12, vl128, mul #13
uqincd  w13, vl256, mul #14
uqinch  w14, mul4, mul #15
uqincw  w15, all, mul #16
sqdecd  z0.d, vl1
sqdech  z1.h, vl2, mul #2
sqdecw  z2.s, vl3, mul #3
sqincd  z3.d, vl4, mul #4
sqinch  z4.h, vl5, mul #5
sqincw  z5.s, vl6, mul #6
uqdecd  z6.d, vl7, mul #7
uqdech  z7.h, vl8, mul #8
uqdecw  z8.s, vl16, mul #9
uqincd  z9.d, vl32, mul #10
uqinch  z10.h, pow2, mul #11
uqincw  z11.s, all, mul #16
ext     z0.b, {v1.b, v2.b}, #0
ext     z2.b, {v3.b, v4.b}, #5
ext     z4.b, {v5.b, v6.b}, #128
ext     z6.b, {v31.b, v0.b}, #255
ext     z0.b, z0.b, z1.b, #0
ext     z2.b, z2.b, z3.b, #31
ext     z4.b, z4.b, z5.b, #64
ext     z6.b, z6.b, z7.b, #255
fmov    z0.h, p1/m, #2.0000
fmov    z2.s, p3/m, #1.0000
fmov    z4.d, p5/m, #-10.0000
fmov    z6.h, p7/m, #-0.1250
fmov    z8.s, p9/m, #31.0000
fmov    z10.d, p11/m, #0.5000

cc @dotnet/arm64-contrib.

Author:	amanasifkhalid
Assignees:	amanasifkhalid
Labels:	`area-CodeGen-coreclr`
Milestone:	-

TIHan · 2024-03-05T01:38:39Z

src/coreclr/jit/emitarm64.cpp

+            imm = emitGetInsSC(id);
+            assert(insOptsScalableAtLeastHalf(id->idInsOpt()));
+            assert(isVectorRegister(id->idReg1()));                           // ddddd
+            assert(isValidSimm8(imm) || isValidUimm8(imm));                   // iiiiiiii


This is validating the encoded imm which is interesting. It does strike me as odd that it's checking if imm is either simm8 or uimm8 - but maybe that is actually fine?

Yeah, I don't love the way this looks. Since this runs in Debug builds only, maybe we should actually try decoding the immediate here to see if it works, instead of taking the easy way out. I'll update this.

TIHan

Looks good Aman!

a74nh · 2024-03-05T09:25:46Z

src/coreclr/jit/emitarm64.cpp

+            fpImm.immFPIVal = (unsigned)imm;
+            assert(insOptsScalableAtLeastHalf(id->idInsOpt()));
+            assert(isVectorRegister(id->idReg1()));                           // ddddd
+            assert(isValidSimm8((ssize_t)emitDecodeFloatImm8(fpImm)));        // iiiiiiii


This line looks wrong. Should emitDecodeFloatImm8 be here?

The result of emitDecodeFloatImm8 is being passed to isValidSimm8

kunalspathak · 2024-03-05T14:32:36Z

src/coreclr/jit/emitarm64.cpp

@@ -24398,6 +24512,7 @@ BYTE* emitter::emitOutput_InstrSve(BYTE* dst, instrDesc* id)
        // Immediate and pattern to general purpose.
        case IF_SVE_BL_1A: // ............iiii ......pppppddddd -- SVE element count
        case IF_SVE_BM_1A: // ............iiii ......pppppddddd -- SVE inc/dec register by element count
+        case IF_SVE_BO_1A: // ...........Xiiii ......pppppddddd -- SVE saturating inc/dec register by element count


how are we taking care of 64-bit encoding, where sf field at bit 20 needs to be 1?

SQDECB

Thanks for pointing that out; fixed.

kunalspathak · 2024-03-05T14:39:02Z

src/coreclr/jit/emitarm64.cpp

+            assert(insOptsNone(opt));
+            assert(isGeneralRegister(reg1)); // ddddd
+            assert(isValidUimm4From1(imm));  // iiii
+            assert(size == EA_4BYTE);


what about EA_8BYTE?

kunalspathak · 2024-03-05T14:41:19Z

src/coreclr/jit/codegenarm64test.cpp

@@ -5912,6 +5912,100 @@ void CodeGen::genArm64EmitterUnitTestsSve()
    theEmitter->emitIns_R_PATTERN_I(INS_sve_incw, EA_SCALABLE, REG_V5, SVE_PATTERN_VL6, 16,
                                    INS_OPTS_SCALABLE_S); // INCW <Zdn>.S{, <pattern>{, MUL #<imm>}}

+    // IF_SVE_BO_1A
+    theEmitter->emitIns_R_PATTERN_I(INS_sve_sqdecb, EA_4BYTE, REG_R0, SVE_PATTERN_POW2,


Here and below, the comment says Xdn, so this should be 8 bytes.

Suggested change

theEmitter->emitIns_R_PATTERN_I(INS_sve_sqdecb, EA_4BYTE, REG_R0, SVE_PATTERN_POW2,

theEmitter->emitIns_R_PATTERN_I(INS_sve_sqdecb, EA_8BYTE, REG_R0, SVE_PATTERN_POW2,

I think that's the 32-bit pattern, right? The docs say the 64-bit pattern is <Xdn>{, <pattern>{, MUL #<imm>}}.

kunalspathak · 2024-03-05T14:42:36Z

src/coreclr/jit/codegenarm64test.cpp

+    // IF_SVE_BO_1A
+    theEmitter->emitIns_R_PATTERN_I(INS_sve_sqdecb, EA_4BYTE, REG_R0, SVE_PATTERN_POW2,
+                                    1); // SQDECB <Xdn>, <Wdn>{, <pattern>{, MUL #<imm>}}
+    theEmitter->emitIns_R_PATTERN_I(INS_sve_sqdecd, EA_4BYTE, REG_R1, SVE_PATTERN_VL1,


as mentioned elsewhere, I don't see the test for SQDECB <Xdn>{, <pattern>{, MUL #<imm>}}

kunalspathak · 2024-03-05T14:43:45Z

src/coreclr/jit/codegenarm64test.cpp

+                                    12); // UQDECW <Wdn>{, <pattern>{, MUL #<imm>}}
+    theEmitter->emitIns_R_PATTERN_I(INS_sve_uqincb, EA_4BYTE, REG_R12, SVE_PATTERN_VL128,
+                                    13); // UQINCB <Wdn>{, <pattern>{, MUL #<imm>}}
+    theEmitter->emitIns_R_PATTERN_I(INS_sve_uqincd, EA_4BYTE, REG_R13, SVE_PATTERN_VL256,


Likewise, I don't see a test for UQDECB <Xdn>{, <pattern>{, MUL #<imm>}}, the 64-bit variant.

kunalspathak

Can you double-check the 64-bit variants of some of the instructions I pointed out?

kunalspathak · 2024-03-05T14:48:27Z

src/coreclr/jit/emitarm64.cpp

+        case IF_SVE_BQ_2A: // ...........iiiii ...iiinnnnnddddd -- SVE extract vector (immediate offset, destructive)
+        case IF_SVE_BQ_2B: // ...........iiiii ...iiimmmmmddddd -- SVE extract vector (immediate offset, destructive)
+            assert(id->idInsOpt() == INS_OPTS_SCALABLE_B);
+            assert(isVectorRegister(id->idReg1())); // ddddd


Can we double-check what the requirements are for {Zn1, Zn2}? For example, should Zn1 be an odd or even register, and what happens if it is Z31 -- do we have to round it?

According to the docs, there don't seem to be any extra requirements for Zn1. Zn2 is calculated by just adding 1 to Zn1, MOD 32, so if Zn1 is 31, Zn2 wraps around to 0. I added a unit test for this that demonstrates the correct behavior -- emitDispVectorRegList seems to already handle this correctly.

amanasifkhalid · 2024-03-05T18:32:39Z

@kunalspathak thanks for the review, I added the 64-bit version of IF_SVE_BO_1A.

kunalspathak

LGTM. Thanks!

amanasifkhalid · 2024-03-05T18:57:24Z

Widespread CI failures are #99320.

amanasifkhalid added 5 commits March 4, 2024 11:12

Add BO_1A

53e90f9

Add BP_1A

692b5fa

Add BQ_2A

a701332

Add BQ_2B

6d4a60b

Add BU_2A

d119f35

ghost assigned amanasifkhalid Mar 4, 2024

dotnet-issue-labeler bot added the area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI label Mar 4, 2024

amanasifkhalid added the arm-sve Work related to arm64 SVE/SVE2 support label Mar 4, 2024

amanasifkhalid mentioned this pull request Mar 4, 2024

Arm64: Implement SVE encodings #94549

Closed

Fix tests

9fdf6f1

TIHan reviewed Mar 5, 2024

View reviewed changes

TIHan approved these changes Mar 5, 2024

View reviewed changes

amanasifkhalid added 3 commits March 4, 2024 21:07

Fix imm8 assert

a470b77

merge from main

0c019bd

Missed a line

f4974d3

This was referenced Mar 5, 2024

Tracking issue for CI build timeouts #76454

Closed

slow macOS - "##[error]The job running on agent Azure Pipelines 9 ran longer than the maximum time of 60 minutes." dotnet/dnceng#1883

Open

a74nh reviewed Mar 5, 2024

View reviewed changes

kunalspathak reviewed Mar 5, 2024

View reviewed changes

kunalspathak requested changes Mar 5, 2024

View reviewed changes

Add 64-bit version of BO_1A

bc91025

kunalspathak approved these changes Mar 5, 2024

View reviewed changes

build-analysis bot mentioned this pull request Mar 5, 2024

Build failing with WaitSubsystem.Unix.cs(347,26): error CA2265: Comparing a span to 'null' might be redundant, the 'null' literal will be implicitly converted to a 'Span<T>.Empty' #99330

Closed

Merge branch 'main' into BO_1A

fa38476

kunalspathak merged commit 22e4825 into dotnet:main Mar 6, 2024
129 checks passed

amanasifkhalid deleted the BO_1A branch March 6, 2024 15:32

github-actions bot locked and limited conversation to collaborators Apr 7, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

JIT ARM64-SVE: Add BP_1A to BQ_2B, BU_2A #99245

JIT ARM64-SVE: Add BP_1A to BQ_2B, BU_2A #99245

amanasifkhalid commented Mar 4, 2024

ghost commented Mar 4, 2024

TIHan Mar 5, 2024

amanasifkhalid Mar 5, 2024

TIHan left a comment

a74nh Mar 5, 2024

amanasifkhalid Mar 5, 2024

kunalspathak Mar 5, 2024

amanasifkhalid Mar 5, 2024

kunalspathak Mar 5, 2024

kunalspathak Mar 5, 2024

amanasifkhalid Mar 5, 2024

kunalspathak Mar 5, 2024

kunalspathak Mar 5, 2024

kunalspathak left a comment

kunalspathak Mar 5, 2024

amanasifkhalid Mar 5, 2024

amanasifkhalid commented Mar 5, 2024

kunalspathak left a comment

amanasifkhalid commented Mar 5, 2024

	theEmitter->emitIns_R_PATTERN_I(INS_sve_sqdecb, EA_4BYTE, REG_R0, SVE_PATTERN_POW2,
	theEmitter->emitIns_R_PATTERN_I(INS_sve_sqdecb, EA_8BYTE, REG_R0, SVE_PATTERN_POW2,

JIT ARM64-SVE: Add BP_1A to BQ_2B, BU_2A #99245

JIT ARM64-SVE: Add BP_1A to BQ_2B, BU_2A #99245

Conversation

amanasifkhalid commented Mar 4, 2024

ghost commented Mar 4, 2024

Choose a reason for hiding this comment

Choose a reason for hiding this comment

TIHan left a comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

kunalspathak left a comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

amanasifkhalid commented Mar 5, 2024

kunalspathak left a comment

Choose a reason for hiding this comment

amanasifkhalid commented Mar 5, 2024