JIT ARM64-SVE: Add FR_2A, GB_2A, FV_2A, FY_3A #98248

amanasifkhalid · 2024-02-09T22:55:30Z

Part of #94549. Adds the following encodings:

IF_SVE_FR_2A
IF_SVE_GB_2A
IF_SVE_FV_2A
IF_SVE_FY_3A

cstool output:

rshrnb        z0.b, z1.h, #1
rshrnt        z2.h, z3.s, #1
shrnb z4.s, z5.d, #1
shrnt z6.b, z7.h, #2
sqrshrnb      z8.h, z9.s, #3
sqrshrnt      z10.s, z11.d, #4
sqrshrunb     z12.b, z13.h, #5
sqrshrunt     z14.h, z15.s, #8
sqshrnb       z16.s, z17.d, #8
sqshrnt       z18.b, z19.h, #6
sqshrunb      z20.h, z21.s, #13
sqshrunt      z22.s, z23.d, #16
uqrshrnb      z24.b, z25.h, #7
uqrshrnt      z26.h, z27.s, #16
uqshrnb       z28.s, z29.d, #32
uqshrnt       z30.b, z31.h, #8
sshllb        z0.h, z1.b, #1
sshllt        z2.h, z3.b, #3
ushllb        z4.h, z5.b, #5
ushllt        z6.h, z7.b, #7
sshllb        z8.s, z9.h, #0
sshllt        z10.s, z11.h, #5
ushllb        z12.s, z13.h, #10
ushllt        z14.s, z15.h, #15
sshllb        z16.d, z17.s, #8
sshllt        z18.d, z19.s, #16
ushllb        z20.d, z21.s, #24
ushllt        z22.d, z23.s, #31
cadd  z0.b, z0.b, z1.b, #90
cadd  z2.h, z2.h, z3.h, #90
cadd  z4.s, z4.s, z5.s, #270
cadd  z6.d, z6.d, z7.d, #270
sqcadd        z8.b, z8.b, z9.b, #270
sqcadd        z10.h, z10.h, z11.h, #270
sqcadd        z12.s, z12.s, z13.s, #90
sqcadd        z14.d, z14.d, z15.d, #90
adclb z0.s, z1.s, z2.s
adclb z3.d, z4.d, z5.d
adclt z6.s, z7.s, z8.s
adclt z9.d, z10.d, z11.d
sbclb z12.s, z13.s, z14.s
sbclb z15.d, z16.d, z17.d
sbclt z18.s, z19.s, z20.s
sbclt z21.d, z22.d, z23.d

JitDisasm output:

rshrnb  z0.b, z1.h, #1
rshrnt  z2.h, z3.s, #1
shrnb   z4.s, z5.d, #1
shrnt   z6.b, z7.h, #2
sqrshrnb z8.h, z9.s, #3
sqrshrnt z10.s, z11.d, #4
sqrshrunb z12.b, z13.h, #5
sqrshrunt z14.h, z15.s, #8
sqshrnb z16.s, z17.d, #8
sqshrnt z18.b, z19.h, #6
sqshrunb z20.h, z21.s, #13
sqshrunt z22.s, z23.d, #16
uqrshrnb z24.b, z25.h, #7
uqrshrnt z26.h, z27.s, #16
uqshrnb z28.s, z29.d, #32
uqshrnt z30.b, z31.h, #8
sshllb  z0.h, z1.b, #1
sshllt  z2.h, z3.b, #3
ushllb  z4.h, z5.b, #5
ushllt  z6.h, z7.b, #7
sshllb  z8.s, z9.h, #0
sshllt  z10.s, z11.h, #5
ushllb  z12.s, z13.h, #10
ushllt  z14.s, z15.h, #15
sshllb  z16.d, z17.s, #8
sshllt  z18.d, z19.s, #16
ushllb  z20.d, z21.s, #24
ushllt  z22.d, z23.s, #31
cadd    z0.b, z0.b, z1.b, #90
cadd    z2.h, z2.h, z3.h, #90
cadd    z4.s, z4.s, z5.s, #270
cadd    z6.d, z6.d, z7.d, #270
sqcadd  z8.b, z8.b, z9.b, #270
sqcadd  z10.h, z10.h, z11.h, #270
sqcadd  z12.s, z12.s, z13.s, #90
sqcadd  z14.d, z14.d, z15.d, #90
adclb   z0.s, z1.s, z2.s
adclb   z3.d, z4.d, z5.d
adclt   z6.s, z7.s, z8.s
adclt   z9.d, z10.d, z11.d
sbclb   z12.s, z13.s, z14.s
sbclb   z15.d, z16.d, z17.d
sbclt   z18.s, z19.s, z20.s
sbclt   z21.d, z22.d, z23.d

cc @dotnet/arm64-contrib

ghost · 2024-02-09T22:55:41Z

Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch
See info in area-owners.md if you want to be subscribed.

Issue Details

Part of #94549. Adds the following encodings:

IF_SVE_FR_2A
IF_SVE_GB_2A
IF_SVE_FV_2A
IF_SVE_FY_3A

cstool output:

rshrnb        z0.b, z1.h, #1
rshrnt        z2.h, z3.s, #1
shrnb z4.s, z5.d, #1
shrnt z6.b, z7.h, #2
sqrshrnb      z8.h, z9.s, #3
sqrshrnt      z10.s, z11.d, #4
sqrshrunb     z12.b, z13.h, #5
sqrshrunt     z14.h, z15.s, #8
sqshrnb       z16.s, z17.d, #8
sqshrnt       z18.b, z19.h, #6
sqshrunb      z20.h, z21.s, #13
sqshrunt      z22.s, z23.d, #16
uqrshrnb      z24.b, z25.h, #7
uqrshrnt      z26.h, z27.s, #16
uqshrnb       z28.s, z29.d, #32
uqshrnt       z30.b, z31.h, #8
sshllb        z0.h, z1.b, #1
sshllt        z2.h, z3.b, #3
ushllb        z4.h, z5.b, #5
ushllt        z6.h, z7.b, #7
sshllb        z8.s, z9.h, #0
sshllt        z10.s, z11.h, #5
ushllb        z12.s, z13.h, #10
ushllt        z14.s, z15.h, #15
sshllb        z16.d, z17.s, #8
sshllt        z18.d, z19.s, #16
ushllb        z20.d, z21.s, #24
ushllt        z22.d, z23.s, #31
cadd  z0.b, z0.b, z1.b, #90
cadd  z2.h, z2.h, z3.h, #90
cadd  z4.s, z4.s, z5.s, #270
cadd  z6.d, z6.d, z7.d, #270
sqcadd        z8.b, z8.b, z9.b, #270
sqcadd        z10.h, z10.h, z11.h, #270
sqcadd        z12.s, z12.s, z13.s, #90
sqcadd        z14.d, z14.d, z15.d, #90
adclb z0.s, z1.s, z2.s
adclb z3.d, z4.d, z5.d
adclt z6.s, z7.s, z8.s
adclt z9.d, z10.d, z11.d
sbclb z12.s, z13.s, z14.s
sbclb z15.d, z16.d, z17.d
sbclt z18.s, z19.s, z20.s
sbclt z21.d, z22.d, z23.d

JitDisasm output:

rshrnb  z0.b, z1.h, #1
rshrnt  z2.h, z3.s, #1
shrnb   z4.s, z5.d, #1
shrnt   z6.b, z7.h, #2
sqrshrnb z8.h, z9.s, #3
sqrshrnt z10.s, z11.d, #4
sqrshrunb z12.b, z13.h, #5
sqrshrunt z14.h, z15.s, #8
sqshrnb z16.s, z17.d, #8
sqshrnt z18.b, z19.h, #6
sqshrunb z20.h, z21.s, #13
sqshrunt z22.s, z23.d, #16
uqrshrnb z24.b, z25.h, #7
uqrshrnt z26.h, z27.s, #16
uqshrnb z28.s, z29.d, #32
uqshrnt z30.b, z31.h, #8
sshllb  z0.h, z1.b, #1
sshllt  z2.h, z3.b, #3
ushllb  z4.h, z5.b, #5
ushllt  z6.h, z7.b, #7
sshllb  z8.s, z9.h, #0
sshllt  z10.s, z11.h, #5
ushllb  z12.s, z13.h, #10
ushllt  z14.s, z15.h, #15
sshllb  z16.d, z17.s, #8
sshllt  z18.d, z19.s, #16
ushllb  z20.d, z21.s, #24
ushllt  z22.d, z23.s, #31
cadd    z0.b, z0.b, z1.b, #90
cadd    z2.h, z2.h, z3.h, #90
cadd    z4.s, z4.s, z5.s, #270
cadd    z6.d, z6.d, z7.d, #270
sqcadd  z8.b, z8.b, z9.b, #270
sqcadd  z10.h, z10.h, z11.h, #270
sqcadd  z12.s, z12.s, z13.s, #90
sqcadd  z14.d, z14.d, z15.d, #90
adclb   z0.s, z1.s, z2.s
adclb   z3.d, z4.d, z5.d
adclt   z6.s, z7.s, z8.s
adclt   z9.d, z10.d, z11.d
sbclb   z12.s, z13.s, z14.s
sbclb   z15.d, z16.d, z17.d
sbclt   z18.s, z19.s, z20.s
sbclt   z21.d, z22.d, z23.d

cc @dotnet/arm64-contrib

Author:	amanasifkhalid
Assignees:	amanasifkhalid
Labels:	`area-CodeGen-coreclr`, `arch-arm64-sve`
Milestone:	-

ryujit-bot · 2024-02-10T00:17:29Z

Diff results for #98248

Throughput diffs

Throughput diffs for linux/arm64 ran on linux/x64

Overall (+0.00% to +0.01%)

Collection	PDIFF
libraries.crossgen2.linux.arm64.checked.mch	+0.01%

MinOpts (+0.00% to +0.01%)

Collection	PDIFF
benchmarks.run.linux.arm64.checked.mch	+0.01%
benchmarks.run_tiered.linux.arm64.checked.mch	+0.01%
coreclr_tests.run.linux.arm64.checked.mch	+0.01%
libraries.crossgen2.linux.arm64.checked.mch	+0.01%
libraries.pmi.linux.arm64.checked.mch	+0.01%
libraries_tests_no_tiered_compilation.run.linux.arm64.Release.mch	+0.01%
realworld.run.linux.arm64.checked.mch	+0.01%

FullOpts (+0.00% to +0.01%)

Collection	PDIFF
libraries.crossgen2.linux.arm64.checked.mch	+0.01%

Throughput diffs for osx/arm64 ran on linux/x64

Overall (+0.00% to +0.01%)

Collection	PDIFF
benchmarks.run_tiered.osx.arm64.checked.mch	+0.01%
libraries.crossgen2.osx.arm64.checked.mch	+0.01%

MinOpts (+0.00% to +0.01%)

Collection	PDIFF
benchmarks.run.osx.arm64.checked.mch	+0.01%
benchmarks.run_pgo.osx.arm64.checked.mch	+0.01%
benchmarks.run_tiered.osx.arm64.checked.mch	+0.01%
coreclr_tests.run.osx.arm64.checked.mch	+0.01%
libraries.crossgen2.osx.arm64.checked.mch	+0.01%
libraries.pmi.osx.arm64.checked.mch	+0.01%
libraries_tests_no_tiered_compilation.run.osx.arm64.Release.mch	+0.01%
realworld.run.osx.arm64.checked.mch	+0.01%

FullOpts (+0.00% to +0.01%)

Collection	PDIFF
libraries.crossgen2.osx.arm64.checked.mch	+0.01%

Throughput diffs for windows/arm64 ran on linux/x64

Overall (+0.00% to +0.01%)

Collection	PDIFF
benchmarks.run_tiered.windows.arm64.checked.mch	+0.01%
libraries.crossgen2.windows.arm64.checked.mch	+0.01%

MinOpts (+0.00% to +0.01%)

Collection	PDIFF
benchmarks.run.windows.arm64.checked.mch	+0.01%
benchmarks.run_pgo.windows.arm64.checked.mch	+0.01%
benchmarks.run_tiered.windows.arm64.checked.mch	+0.01%
coreclr_tests.run.windows.arm64.checked.mch	+0.01%
libraries.crossgen2.windows.arm64.checked.mch	+0.01%
libraries.pmi.windows.arm64.checked.mch	+0.01%
libraries_tests.run.windows.arm64.Release.mch	+0.01%
libraries_tests_no_tiered_compilation.run.windows.arm64.Release.mch	+0.01%
realworld.run.windows.arm64.checked.mch	+0.01%

FullOpts (+0.00% to +0.01%)

Collection	PDIFF
libraries.crossgen2.windows.arm64.checked.mch	+0.01%

Details here

a74nh

LGTM

ryujit-bot · 2024-02-12T15:09:40Z

Diff results for #98248

Throughput diffs

Throughput diffs for osx/arm64 ran on windows/x64

MinOpts (-0.00% to +0.01%)

Collection	PDIFF
libraries.pmi.osx.arm64.checked.mch	+0.01%

Throughput diffs for windows/arm64 ran on windows/x64

MinOpts (-0.00% to +0.01%)

Collection	PDIFF
realworld.run.windows.arm64.checked.mch	+0.01%

Details here

Throughput diffs for linux/arm64 ran on linux/x64

MinOpts (-0.01% to -0.00%)

Collection	PDIFF
smoke_tests.nativeaot.linux.arm64.checked.mch	-0.01%
libraries.crossgen2.linux.arm64.checked.mch	-0.01%
benchmarks.run.linux.arm64.checked.mch	-0.01%

Details here

kunalspathak

LGTM

kunalspathak · 2024-02-12T19:24:28Z

src/coreclr/jit/emitarm64.cpp

+            assert(insOptsScalableStandard(opt));
+            assert(isVectorRegister(reg1));                        // ddddd
+            assert(isVectorRegister(reg2));                        // nnnnn
+            assert((imm == 90) || (imm == 270));                   // r


Suggested change

assert((imm == 90) || (imm == 270)); // r

assert(emitIsValidEncodedRotationImm90_or_270(imm)); // r

emitIsValidEncodedRotationImm90_or_270 checks if imm is 0 or 1. Maybe I can add isValidRot90_or_270?

I think you should add emitIsValidEncodedRotationImm90_or_270() checks around line 1617, i.e. inside emitInsSanityCheck(), the way @TIHan did. Can you make it consistent with how we handle, e.g., IF_SVE_GP_3A?

kunalspathak · 2024-02-12T19:26:41Z

src/coreclr/jit/emitarm64.h

+// Returns true if 'value' is a legal unsigned immediate 3 bit encoding, starting from 1 (such as for SHRNB).
+static bool isValidUimm3From1(ssize_t value)
+{
+    return (1 <= value) && (value <= 8);


Some of the numbers in isValid* are hex while others are decimal; do you mind changing them to hex wherever it makes sense?

Sure thing. By "wherever it makes sense," do you mean we should only convert decimal numbers to hex if they're >=10? Because in the above example, there wouldn't be anything to change.

Yeah, it was a nit comment, not for this particular example. There are cases such as <= 15 that can be converted to <= 0xF, but other places like (value <= 224) can stay decimal.

ryujit-bot · 2024-02-12T22:10:55Z

Diff results for #98248

Throughput diffs

Throughput diffs for osx/arm64 ran on linux/x64

MinOpts (+0.00% to +0.01%)

Collection	PDIFF
libraries.pmi.osx.arm64.checked.mch	+0.01%
realworld.run.osx.arm64.checked.mch	+0.01%

Details here

amanasifkhalid · 2024-02-12T22:35:02Z

@kunalspathak thanks for the review, I've addressed your feedback. Is it ok if I merge this?

kunalspathak

A few things to take care of.

kunalspathak · 2024-02-12T22:54:47Z

src/coreclr/jit/emitarm64.cpp

@@ -1679,7 +1753,7 @@ void emitter::emitInsSanityCheck(instrDesc* id)
            assert(insOptsScalableStandard(id->idInsOpt()));
            assert(isVectorRegister(id->idReg1()));                           // ddddd
            assert(isVectorRegister(id->idReg2()));                           // nnnnn
-            assert(isValidUimm2(emitGetInsSC(id)));                           // rr
+            assert(emitIsValidEncodedRotationImm0_to_270(emitGetInsSC(id)));  // rr


thanks for fixing this.

kunalspathak · 2024-02-12T22:56:03Z

src/coreclr/jit/emitarm64.cpp

+            code |= insEncodeReg_V_9_to_5(id->idReg2());                  // mmmmm
+            code |= insEncodeImm1_10(emitGetInsSC(id));                   // r
+            code |= insEncodeElemsize(optGetSveElemsize(id->idInsOpt())); // xx
+            dst += emitOutput_Instr(dst, code);


missing check for 90 or 270?

We check for this in emitIns_R_R_I and in emitInsSanityCheck. Do you want me to check for this in emitOutputInstr too? I suppose I can add a dedicated insEncodeRot* method for encoding the bit at location 10 to make the meaning of r more clear.

kunalspathak · 2024-02-12T22:56:54Z

src/coreclr/jit/emitarm64.cpp

+        case IF_SVE_FV_2A: // ........xx...... .....rmmmmmddddd -- SVE2 complex integer add
+        {
+            // Rotation bit implies rotation is 270 if set, else rotation is 90
+            const ssize_t rot = (emitGetInsSC(id) == 0) ? 90 : 270;


can you use decode function for this and at other places if they are missing?

Sure thing, thanks for catching that.

ryujit-bot · 2024-02-12T23:11:10Z

Diff results for #98248

Throughput diffs

Throughput diffs for osx/arm64 ran on windows/x64

MinOpts (+0.00% to +0.01%)

Collection	PDIFF
libraries.pmi.osx.arm64.checked.mch	+0.01%
realworld.run.osx.arm64.checked.mch	+0.01%

Details here

Throughput diffs for linux/arm64 ran on linux/x64

MinOpts (-0.01% to -0.00%)

Collection	PDIFF
smoke_tests.nativeaot.linux.arm64.checked.mch	-0.01%
benchmarks.run.linux.arm64.checked.mch	-0.01%
libraries.crossgen2.linux.arm64.checked.mch	-0.01%

Details here

ryujit-bot · 2024-02-13T02:11:43Z

Diff results for #98248

Throughput diffs

Throughput diffs for linux/arm64 ran on windows/x64

MinOpts (+0.00% to +0.01%)

Collection	PDIFF
realworld.run.linux.arm64.checked.mch	+0.01%

Details here

Throughput diffs for linux/arm64 ran on linux/x64

MinOpts (-0.01% to -0.00%)

Collection	PDIFF
libraries.crossgen2.linux.arm64.checked.mch	-0.01%
benchmarks.run.linux.arm64.checked.mch	-0.01%
smoke_tests.nativeaot.linux.arm64.checked.mch	-0.01%

Details here

amanasifkhalid added 5 commits February 9, 2024 13:37

wip

dff1ec1

Add IF_SVE_FR_2A, IF_SVE_GB_2A

1697ebb

Add IF_SVE_FV_2A

687e888

Add IF_SVE_FY_3A

03ea816

Fix comment

eb9929e

amanasifkhalid added the arm-sve Work related to arm64 SVE/SVE2 support label Feb 9, 2024

dotnet-issue-labeler bot added the area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI label Feb 9, 2024

ghost assigned amanasifkhalid Feb 9, 2024

amanasifkhalid mentioned this pull request Feb 9, 2024

Arm64: Implement SVE encodings #94549

Closed

build-analysis bot mentioned this pull request Feb 10, 2024

Tests crashing in CI with no dump: exit code 137 means SIGKILL Killed #97049

Closed

Merge from main

ea6f3b6

build-analysis bot mentioned this pull request Feb 12, 2024

System.Net.Security.Tests.SslStreamCertificateContextOcspLinuxTests.RefreshOcspResponse_BeforeExpiration test failure #97779

Closed

a74nh approved these changes Feb 12, 2024

View reviewed changes

kunalspathak approved these changes Feb 12, 2024

View reviewed changes

amanasifkhalid added 3 commits February 12, 2024 14:41

Add isValidRot90_or_270

840ae08

Remove isValidRot90_to_270; consistent rot checks

58ccf0b

Fix hex values

6e08361

kunalspathak reviewed Feb 12, 2024

View reviewed changes

Use rotation decode helper

03071da

kunalspathak approved these changes Feb 13, 2024

View reviewed changes

kunalspathak merged commit 5efaf50 into dotnet:main Feb 13, 2024
127 of 129 checks passed

amanasifkhalid deleted the sve-fr-2a branch February 13, 2024 16:16

github-actions bot locked and limited conversation to collaborators Mar 16, 2024

	assert((imm == 90) || (imm == 270)); // r
	assert(emitIsValidEncodedRotationImm90_or_270(imm)); // r

JIT ARM64-SVE: Add FR_2A, GB_2A, FV_2A, FY_3A #98248

JIT ARM64-SVE: Add FR_2A, GB_2A, FV_2A, FY_3A #98248

Conversation

amanasifkhalid commented Feb 9, 2024

ghost commented Feb 9, 2024

ryujit-bot commented Feb 10, 2024

Throughput diffs

Throughput diffs for linux/arm64 ran on linux/x64

Throughput diffs for osx/arm64 ran on linux/x64

Throughput diffs for windows/arm64 ran on linux/x64

a74nh left a comment

Choose a reason for hiding this comment

ryujit-bot commented Feb 12, 2024

Throughput diffs

Throughput diffs for osx/arm64 ran on windows/x64

Throughput diffs for windows/arm64 ran on windows/x64

Throughput diffs for linux/arm64 ran on linux/x64

kunalspathak left a comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

ryujit-bot commented Feb 12, 2024

Throughput diffs

Throughput diffs for osx/arm64 ran on linux/x64

amanasifkhalid commented Feb 12, 2024

kunalspathak left a comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

ryujit-bot commented Feb 12, 2024

Throughput diffs

Throughput diffs for osx/arm64 ran on windows/x64

Throughput diffs for linux/arm64 ran on linux/x64

ryujit-bot commented Feb 13, 2024

Throughput diffs

Throughput diffs for linux/arm64 ran on windows/x64

Throughput diffs for linux/arm64 ran on linux/x64