Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

JIT ARM64-SVE: Add FR_2A, GB_2A, FV_2A, FY_3A #98248

Merged
merged 10 commits into dotnet:main from sve-fr-2a
Feb 13, 2024

Conversation

amanasifkhalid
Copy link
Member

Part of #94549. Adds the following encodings:

  • IF_SVE_FR_2A
  • IF_SVE_GB_2A
  • IF_SVE_FV_2A
  • IF_SVE_FY_3A

cstool output:

rshrnb        z0.b, z1.h, #1
rshrnt        z2.h, z3.s, #1
shrnb z4.s, z5.d, #1
shrnt z6.b, z7.h, #2
sqrshrnb      z8.h, z9.s, #3
sqrshrnt      z10.s, z11.d, #4
sqrshrunb     z12.b, z13.h, #5
sqrshrunt     z14.h, z15.s, #8
sqshrnb       z16.s, z17.d, #8
sqshrnt       z18.b, z19.h, #6
sqshrunb      z20.h, z21.s, #13
sqshrunt      z22.s, z23.d, #16
uqrshrnb      z24.b, z25.h, #7
uqrshrnt      z26.h, z27.s, #16
uqshrnb       z28.s, z29.d, #32
uqshrnt       z30.b, z31.h, #8
sshllb        z0.h, z1.b, #1
sshllt        z2.h, z3.b, #3
ushllb        z4.h, z5.b, #5
ushllt        z6.h, z7.b, #7
sshllb        z8.s, z9.h, #0
sshllt        z10.s, z11.h, #5
ushllb        z12.s, z13.h, #10
ushllt        z14.s, z15.h, #15
sshllb        z16.d, z17.s, #8
sshllt        z18.d, z19.s, #16
ushllb        z20.d, z21.s, #24
ushllt        z22.d, z23.s, #31
cadd  z0.b, z0.b, z1.b, #90
cadd  z2.h, z2.h, z3.h, #90
cadd  z4.s, z4.s, z5.s, #270
cadd  z6.d, z6.d, z7.d, #270
sqcadd        z8.b, z8.b, z9.b, #270
sqcadd        z10.h, z10.h, z11.h, #270
sqcadd        z12.s, z12.s, z13.s, #90
sqcadd        z14.d, z14.d, z15.d, #90
adclb z0.s, z1.s, z2.s
adclb z3.d, z4.d, z5.d
adclt z6.s, z7.s, z8.s
adclt z9.d, z10.d, z11.d
sbclb z12.s, z13.s, z14.s
sbclb z15.d, z16.d, z17.d
sbclt z18.s, z19.s, z20.s
sbclt z21.d, z22.d, z23.d

JitDisasm output:

rshrnb  z0.b, z1.h, #1
rshrnt  z2.h, z3.s, #1
shrnb   z4.s, z5.d, #1
shrnt   z6.b, z7.h, #2
sqrshrnb z8.h, z9.s, #3
sqrshrnt z10.s, z11.d, #4
sqrshrunb z12.b, z13.h, #5
sqrshrunt z14.h, z15.s, #8
sqshrnb z16.s, z17.d, #8
sqshrnt z18.b, z19.h, #6
sqshrunb z20.h, z21.s, #13
sqshrunt z22.s, z23.d, #16
uqrshrnb z24.b, z25.h, #7
uqrshrnt z26.h, z27.s, #16
uqshrnb z28.s, z29.d, #32
uqshrnt z30.b, z31.h, #8
sshllb  z0.h, z1.b, #1
sshllt  z2.h, z3.b, #3
ushllb  z4.h, z5.b, #5
ushllt  z6.h, z7.b, #7
sshllb  z8.s, z9.h, #0
sshllt  z10.s, z11.h, #5
ushllb  z12.s, z13.h, #10
ushllt  z14.s, z15.h, #15
sshllb  z16.d, z17.s, #8
sshllt  z18.d, z19.s, #16
ushllb  z20.d, z21.s, #24
ushllt  z22.d, z23.s, #31
cadd    z0.b, z0.b, z1.b, #90
cadd    z2.h, z2.h, z3.h, #90
cadd    z4.s, z4.s, z5.s, #270
cadd    z6.d, z6.d, z7.d, #270
sqcadd  z8.b, z8.b, z9.b, #270
sqcadd  z10.h, z10.h, z11.h, #270
sqcadd  z12.s, z12.s, z13.s, #90
sqcadd  z14.d, z14.d, z15.d, #90
adclb   z0.s, z1.s, z2.s
adclb   z3.d, z4.d, z5.d
adclt   z6.s, z7.s, z8.s
adclt   z9.d, z10.d, z11.d
sbclb   z12.s, z13.s, z14.s
sbclb   z15.d, z16.d, z17.d
sbclt   z18.s, z19.s, z20.s
sbclt   z21.d, z22.d, z23.d

cc @dotnet/arm64-contrib

@amanasifkhalid amanasifkhalid added the arm-sve Work related to arm64 SVE/SVE2 support label Feb 9, 2024
@dotnet-issue-labeler dotnet-issue-labeler bot added the area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI label Feb 9, 2024
@ghost ghost assigned amanasifkhalid Feb 9, 2024
@ghost
Copy link

ghost commented Feb 9, 2024

Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch
See info in area-owners.md if you want to be subscribed.

Issue Details

Part of #94549. Adds the following encodings:

  • IF_SVE_FR_2A
  • IF_SVE_GB_2A
  • IF_SVE_FV_2A
  • IF_SVE_FY_3A

cstool output:

rshrnb        z0.b, z1.h, #1
rshrnt        z2.h, z3.s, #1
shrnb z4.s, z5.d, #1
shrnt z6.b, z7.h, #2
sqrshrnb      z8.h, z9.s, #3
sqrshrnt      z10.s, z11.d, #4
sqrshrunb     z12.b, z13.h, #5
sqrshrunt     z14.h, z15.s, #8
sqshrnb       z16.s, z17.d, #8
sqshrnt       z18.b, z19.h, #6
sqshrunb      z20.h, z21.s, #13
sqshrunt      z22.s, z23.d, #16
uqrshrnb      z24.b, z25.h, #7
uqrshrnt      z26.h, z27.s, #16
uqshrnb       z28.s, z29.d, #32
uqshrnt       z30.b, z31.h, #8
sshllb        z0.h, z1.b, #1
sshllt        z2.h, z3.b, #3
ushllb        z4.h, z5.b, #5
ushllt        z6.h, z7.b, #7
sshllb        z8.s, z9.h, #0
sshllt        z10.s, z11.h, #5
ushllb        z12.s, z13.h, #10
ushllt        z14.s, z15.h, #15
sshllb        z16.d, z17.s, #8
sshllt        z18.d, z19.s, #16
ushllb        z20.d, z21.s, #24
ushllt        z22.d, z23.s, #31
cadd  z0.b, z0.b, z1.b, #90
cadd  z2.h, z2.h, z3.h, #90
cadd  z4.s, z4.s, z5.s, #270
cadd  z6.d, z6.d, z7.d, #270
sqcadd        z8.b, z8.b, z9.b, #270
sqcadd        z10.h, z10.h, z11.h, #270
sqcadd        z12.s, z12.s, z13.s, #90
sqcadd        z14.d, z14.d, z15.d, #90
adclb z0.s, z1.s, z2.s
adclb z3.d, z4.d, z5.d
adclt z6.s, z7.s, z8.s
adclt z9.d, z10.d, z11.d
sbclb z12.s, z13.s, z14.s
sbclb z15.d, z16.d, z17.d
sbclt z18.s, z19.s, z20.s
sbclt z21.d, z22.d, z23.d

JitDisasm output:

rshrnb  z0.b, z1.h, #1
rshrnt  z2.h, z3.s, #1
shrnb   z4.s, z5.d, #1
shrnt   z6.b, z7.h, #2
sqrshrnb z8.h, z9.s, #3
sqrshrnt z10.s, z11.d, #4
sqrshrunb z12.b, z13.h, #5
sqrshrunt z14.h, z15.s, #8
sqshrnb z16.s, z17.d, #8
sqshrnt z18.b, z19.h, #6
sqshrunb z20.h, z21.s, #13
sqshrunt z22.s, z23.d, #16
uqrshrnb z24.b, z25.h, #7
uqrshrnt z26.h, z27.s, #16
uqshrnb z28.s, z29.d, #32
uqshrnt z30.b, z31.h, #8
sshllb  z0.h, z1.b, #1
sshllt  z2.h, z3.b, #3
ushllb  z4.h, z5.b, #5
ushllt  z6.h, z7.b, #7
sshllb  z8.s, z9.h, #0
sshllt  z10.s, z11.h, #5
ushllb  z12.s, z13.h, #10
ushllt  z14.s, z15.h, #15
sshllb  z16.d, z17.s, #8
sshllt  z18.d, z19.s, #16
ushllb  z20.d, z21.s, #24
ushllt  z22.d, z23.s, #31
cadd    z0.b, z0.b, z1.b, #90
cadd    z2.h, z2.h, z3.h, #90
cadd    z4.s, z4.s, z5.s, #270
cadd    z6.d, z6.d, z7.d, #270
sqcadd  z8.b, z8.b, z9.b, #270
sqcadd  z10.h, z10.h, z11.h, #270
sqcadd  z12.s, z12.s, z13.s, #90
sqcadd  z14.d, z14.d, z15.d, #90
adclb   z0.s, z1.s, z2.s
adclb   z3.d, z4.d, z5.d
adclt   z6.s, z7.s, z8.s
adclt   z9.d, z10.d, z11.d
sbclb   z12.s, z13.s, z14.s
sbclb   z15.d, z16.d, z17.d
sbclt   z18.s, z19.s, z20.s
sbclt   z21.d, z22.d, z23.d

cc @dotnet/arm64-contrib

Author: amanasifkhalid
Assignees: amanasifkhalid
Labels:

area-CodeGen-coreclr, arch-arm64-sve

Milestone: -

@ryujit-bot
Copy link

Diff results for #98248

Throughput diffs

Throughput diffs for linux/arm64 ran on linux/x64

Overall (+0.00% to +0.01%)
Collection PDIFF
libraries.crossgen2.linux.arm64.checked.mch +0.01%
MinOpts (+0.00% to +0.01%)
Collection PDIFF
benchmarks.run.linux.arm64.checked.mch +0.01%
benchmarks.run_tiered.linux.arm64.checked.mch +0.01%
coreclr_tests.run.linux.arm64.checked.mch +0.01%
libraries.crossgen2.linux.arm64.checked.mch +0.01%
libraries.pmi.linux.arm64.checked.mch +0.01%
libraries_tests_no_tiered_compilation.run.linux.arm64.Release.mch +0.01%
realworld.run.linux.arm64.checked.mch +0.01%
FullOpts (+0.00% to +0.01%)
Collection PDIFF
libraries.crossgen2.linux.arm64.checked.mch +0.01%

Throughput diffs for osx/arm64 ran on linux/x64

Overall (+0.00% to +0.01%)
Collection PDIFF
benchmarks.run_tiered.osx.arm64.checked.mch +0.01%
libraries.crossgen2.osx.arm64.checked.mch +0.01%
MinOpts (+0.00% to +0.01%)
Collection PDIFF
benchmarks.run.osx.arm64.checked.mch +0.01%
benchmarks.run_pgo.osx.arm64.checked.mch +0.01%
benchmarks.run_tiered.osx.arm64.checked.mch +0.01%
coreclr_tests.run.osx.arm64.checked.mch +0.01%
libraries.crossgen2.osx.arm64.checked.mch +0.01%
libraries.pmi.osx.arm64.checked.mch +0.01%
libraries_tests_no_tiered_compilation.run.osx.arm64.Release.mch +0.01%
realworld.run.osx.arm64.checked.mch +0.01%
FullOpts (+0.00% to +0.01%)
Collection PDIFF
libraries.crossgen2.osx.arm64.checked.mch +0.01%

Throughput diffs for windows/arm64 ran on linux/x64

Overall (+0.00% to +0.01%)
Collection PDIFF
benchmarks.run_tiered.windows.arm64.checked.mch +0.01%
libraries.crossgen2.windows.arm64.checked.mch +0.01%
MinOpts (+0.00% to +0.01%)
Collection PDIFF
benchmarks.run.windows.arm64.checked.mch +0.01%
benchmarks.run_pgo.windows.arm64.checked.mch +0.01%
benchmarks.run_tiered.windows.arm64.checked.mch +0.01%
coreclr_tests.run.windows.arm64.checked.mch +0.01%
libraries.crossgen2.windows.arm64.checked.mch +0.01%
libraries.pmi.windows.arm64.checked.mch +0.01%
libraries_tests.run.windows.arm64.Release.mch +0.01%
libraries_tests_no_tiered_compilation.run.windows.arm64.Release.mch +0.01%
realworld.run.windows.arm64.checked.mch +0.01%
FullOpts (+0.00% to +0.01%)
Collection PDIFF
libraries.crossgen2.windows.arm64.checked.mch +0.01%

Details here


Copy link
Contributor

@a74nh a74nh left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

@ryujit-bot
Copy link

Diff results for #98248

Throughput diffs

Throughput diffs for osx/arm64 ran on windows/x64

MinOpts (-0.00% to +0.01%)
Collection PDIFF
libraries.pmi.osx.arm64.checked.mch +0.01%

Throughput diffs for windows/arm64 ran on windows/x64

MinOpts (-0.00% to +0.01%)
Collection PDIFF
realworld.run.windows.arm64.checked.mch +0.01%

Details here


Throughput diffs for linux/arm64 ran on linux/x64

MinOpts (-0.01% to -0.00%)
Collection PDIFF
smoke_tests.nativeaot.linux.arm64.checked.mch -0.01%
libraries.crossgen2.linux.arm64.checked.mch -0.01%
benchmarks.run.linux.arm64.checked.mch -0.01%

Details here


Copy link
Member

@kunalspathak kunalspathak left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

assert(insOptsScalableStandard(opt));
assert(isVectorRegister(reg1)); // ddddd
assert(isVectorRegister(reg2)); // nnnnn
assert((imm == 90) || (imm == 270)); // r
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
- assert((imm == 90) || (imm == 270)); // r
+ assert(emitIsValidEncodedRotationImm90_or_270(imm)); // r

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

emitIsValidEncodedRotationImm90_or_270 checks if imm is 0 or 1. Maybe I can add isValidRot90_or_270?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you should add emitIsValidEncodedRotationImm90_or_270() checks around line 1617, i.e. inside emitInsSanityCheck(), the way @TIHan did. Can you make it consistent with how we handle other formats, e.g. IF_SVE_GP_3A?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure thing

// Returns true if 'value' is a legal unsigned immediate 3 bit encoding, starting from 1 (such as for SHRNB).
static bool isValidUimm3From1(ssize_t value)
{
return (1 <= value) && (value <= 8);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some of the numbers in isValid* are hex while others are decimal. Do you mind changing them to hex wherever it makes sense?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure thing. By "wherever it makes sense," do you mean we should only convert decimal numbers to hex if they're >=10? Because in the above example, there wouldn't be anything to change.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, it was a nit comment, and not about this particular example. There are cases such as <= 15 that can be converted to <= 0xF, but other places like (value <= 224) can stay decimal.

@ryujit-bot
Copy link

Diff results for #98248

Throughput diffs

Throughput diffs for osx/arm64 ran on linux/x64

MinOpts (+0.00% to +0.01%)
Collection PDIFF
libraries.pmi.osx.arm64.checked.mch +0.01%
realworld.run.osx.arm64.checked.mch +0.01%

Details here


@amanasifkhalid
Copy link
Member Author

@kunalspathak thanks for the review, I've addressed your feedback. Is it ok if I merge this?

Copy link
Member

@kunalspathak kunalspathak left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A few things to take care of.

@@ -1679,7 +1753,7 @@ void emitter::emitInsSanityCheck(instrDesc* id)
assert(insOptsScalableStandard(id->idInsOpt()));
assert(isVectorRegister(id->idReg1())); // ddddd
assert(isVectorRegister(id->idReg2())); // nnnnn
- assert(isValidUimm2(emitGetInsSC(id))); // rr
+ assert(emitIsValidEncodedRotationImm0_to_270(emitGetInsSC(id))); // rr
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks for fixing this.

code |= insEncodeReg_V_9_to_5(id->idReg2()); // mmmmm
code |= insEncodeImm1_10(emitGetInsSC(id)); // r
code |= insEncodeElemsize(optGetSveElemsize(id->idInsOpt())); // xx
dst += emitOutput_Instr(dst, code);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

missing check for 90 or 270?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We check for this in emitIns_R_R_I and in emitInsSanityCheck. Do you want me to check for this in emitOutputInstr too? I suppose I can add a dedicated insEncodeRot* method for encoding the bit at location 10 to make the meaning of r more clear.

case IF_SVE_FV_2A: // ........xx...... .....rmmmmmddddd -- SVE2 complex integer add
{
// Rotation bit implies rotation is 270 if set, else rotation is 90
const ssize_t rot = (emitGetInsSC(id) == 0) ? 90 : 270;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you use decode function for this and at other places if they are missing?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure thing, thanks for catching that.

@ryujit-bot
Copy link

Diff results for #98248

Throughput diffs

Throughput diffs for osx/arm64 ran on windows/x64

MinOpts (+0.00% to +0.01%)
Collection PDIFF
libraries.pmi.osx.arm64.checked.mch +0.01%
realworld.run.osx.arm64.checked.mch +0.01%

Details here


Throughput diffs for linux/arm64 ran on linux/x64

MinOpts (-0.01% to -0.00%)
Collection PDIFF
smoke_tests.nativeaot.linux.arm64.checked.mch -0.01%
benchmarks.run.linux.arm64.checked.mch -0.01%
libraries.crossgen2.linux.arm64.checked.mch -0.01%

Details here


@ryujit-bot
Copy link

Diff results for #98248

Throughput diffs

Throughput diffs for linux/arm64 ran on windows/x64

MinOpts (+0.00% to +0.01%)
Collection PDIFF
realworld.run.linux.arm64.checked.mch +0.01%

Details here


Throughput diffs for linux/arm64 ran on linux/x64

MinOpts (-0.01% to -0.00%)
Collection PDIFF
libraries.crossgen2.linux.arm64.checked.mch -0.01%
benchmarks.run.linux.arm64.checked.mch -0.01%
smoke_tests.nativeaot.linux.arm64.checked.mch -0.01%

Details here


@kunalspathak kunalspathak merged commit 5efaf50 into dotnet:main Feb 13, 2024
127 of 129 checks passed
@amanasifkhalid amanasifkhalid deleted the sve-fr-2a branch February 13, 2024 16:16
@github-actions github-actions bot locked and limited conversation to collaborators Mar 16, 2024
Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI arm-sve Work related to arm64 SVE/SVE2 support
Projects
None yet
Development

Successfully merging this pull request may close these issues.

4 participants