From 18885d93b718016661741781fc7d28dd91573aec Mon Sep 17 00:00:00 2001
From: Alex Crichton
Date: Thu, 31 Aug 2023 22:18:25 +0200
Subject: [PATCH] aarch64: Use `RegScaled*` addressing modes (#6945)

This commit adds a few cases to `amode` construction on AArch64 for
using the `RegScaled*` variants of `AMode`. This won't affect wasm due
to this only matching the sign-extension happening before the shift,
but it should otherwise help non-wasm Cranelift use cases.

Closes #6742
---
 cranelift/codegen/src/isa/aarch64/inst.isle   |  26 +-
 cranelift/codegen/src/prelude.isle            |   1 +
 .../filetests/isa/aarch64/amodes.clif         | 266 ++++++++++++++++++
 3 files changed, 291 insertions(+), 2 deletions(-)

diff --git a/cranelift/codegen/src/isa/aarch64/inst.isle b/cranelift/codegen/src/isa/aarch64/inst.isle
index 806387d9183a..edfa31432ecf 100644
--- a/cranelift/codegen/src/isa/aarch64/inst.isle
+++ b/cranelift/codegen/src/isa/aarch64/inst.isle
@@ -3122,15 +3122,37 @@
 (rule 5 (amode ty (iadd (sextend x @ (value_type $I32)) y) offset)
   (AMode.RegExtended (amode_add y offset) x (ExtendOp.SXTW)))
 
+;; `RegScaled*` rules where this matches an addition of an "index register" to a
+;; base register. The index register is shifted by the size of the type loaded
+;; in bytes to enable this mode matching.
+;;
+;; Note that this can additionally bundle an extending operation but the
+;; extension must happen before the shift. This will pattern-match the shift
+;; first and then if that succeeds afterwards try to find an extend.
+(rule 6 (amode ty (iadd x (ishl y (iconst (u64_from_imm64 n)))) offset)
+  (if-let $true (u64_eq (ty_bytes ty) (u64_shl 1 n)))
+  (amode_reg_scaled (amode_add x offset) y ty))
+(rule 7 (amode ty (iadd (ishl y (iconst (u64_from_imm64 n))) x) offset)
+  (if-let $true (u64_eq (ty_bytes ty) (u64_shl 1 n)))
+  (amode_reg_scaled (amode_add x offset) y ty))
+
+(decl amode_reg_scaled (Reg Value Type) AMode)
+(rule 0 (amode_reg_scaled base index ty)
+  (AMode.RegScaled base index ty))
+(rule 1 (amode_reg_scaled base (uextend index @ (value_type $I32)) ty)
+  (AMode.RegScaledExtended base index ty (ExtendOp.UXTW)))
+(rule 2 (amode_reg_scaled base (sextend index @ (value_type $I32)) ty)
+  (AMode.RegScaledExtended base index ty (ExtendOp.SXTW)))
+
 ;; Small optimizations where constants found in `iadd` are folded into the
 ;; `offset` immediate.
 ;;
 ;; NB: this should probably be done by mid-end optimizations rather than here
 ;; in the backend, but currently Cranelift doesn't do that.
-(rule 6 (amode ty (iadd x (iconst (simm32 y))) offset)
+(rule 8 (amode ty (iadd x (iconst (simm32 y))) offset)
   (if-let new_offset (s32_add_fallible y offset))
   (amode ty x new_offset))
-(rule 7 (amode ty (iadd (iconst (simm32 x)) y) offset)
+(rule 9 (amode ty (iadd (iconst (simm32 x)) y) offset)
   (if-let new_offset (s32_add_fallible x offset))
   (amode ty y new_offset))

diff --git a/cranelift/codegen/src/prelude.isle b/cranelift/codegen/src/prelude.isle
index d83d4b518e86..2f4b720b1063 100644
--- a/cranelift/codegen/src/prelude.isle
+++ b/cranelift/codegen/src/prelude.isle
@@ -87,6 +87,7 @@
 (decl pure u16_as_u64 (u16) u64)
 (extern constructor u16_as_u64 u16_as_u64)
+(convert u16 u64 u16_as_u64)
 
 (decl pure u32_as_u64 (u32) u64)
 (extern constructor u32_as_u64 u32_as_u64)

diff --git a/cranelift/filetests/filetests/isa/aarch64/amodes.clif b/cranelift/filetests/filetests/isa/aarch64/amodes.clif
index 1bb7e0b2333e..176f70da7bd3 100644
--- a/cranelift/filetests/filetests/isa/aarch64/amodes.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/amodes.clif
@@ -519,3 +519,269 @@ block0(v0: i64, v1: i32):
 ; stp x0, x1, [x6]
 ; ret
 
+function %load_scaled8(i64, i64) -> i8 {
+block0(v0: i64, v1: i64):
+    v2 = ishl_imm v1, 0
+    v3 = iadd v0, v2
+    v4 = load.i8 v3
+    return v4
+}
+
+; VCode:
+; block0:
+;   ldrb w0, [x0, x1, LSL #0]
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   ldrb w0, [x0, x1, lsl #0]
+;   ret
+
+function %load_scaled16(i64, i64) -> i16 {
+block0(v0: i64, v1: i64):
+    v2 = ishl_imm v1, 1
+    v3 = iadd v0, v2
+    v4 = load.i16 v3
+    return v4
+}
+
+; VCode:
+; block0:
+;   ldrh w0, [x0, x1, LSL #1]
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   ldrh w0, [x0, x1, lsl #1]
+;   ret
+
+function %load_scaled32(i64, i64) -> i32 {
+block0(v0: i64, v1: i64):
+    v2 = ishl_imm v1, 2
+    v3 = iadd v0, v2
+    v4 = load.i32 v3
+    return v4
+}
+
+; VCode:
+; block0:
+;   ldr w0, [x0, x1, LSL #2]
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   ldr w0, [x0, x1, lsl #2]
+;   ret
+
+function %load_scaled64(i64, i64) -> i64 {
+block0(v0: i64, v1: i64):
+    v2 = ishl_imm v1, 3
+    v3 = iadd v0, v2
+    v4 = load.i64 v3
+    return v4
+}
+
+; VCode:
+; block0:
+;   ldr x0, [x0, x1, LSL #3]
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   ldr x0, [x0, x1, lsl #3]
+;   ret
+
+function %load_not_scaled64(i64, i64) -> i64 {
+block0(v0: i64, v1: i64):
+    v2 = ishl_imm v1, 2
+    v3 = iadd v0, v2
+    v4 = load.i64 v3
+    return v4
+}
+
+; VCode:
+; block0:
+;   lsl x4, x1, #2
+;   ldr x0, [x0, x4]
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   lsl x4, x1, #2
+;   ldr x0, [x0, x4]
+;   ret
+
+function %load_uextend_scaled8(i64, i32) -> i8 {
+block0(v0: i64, v1: i32):
+    v2 = uextend.i64 v1
+    v3 = ishl_imm v2, 0
+    v4 = iadd v0, v3
+    v5 = load.i8 v4
+    return v5
+}
+
+; VCode:
+; block0:
+;   ldrb w0, [x0, w1, UXTW #0]
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   ldrb w0, [x0, w1, uxtw #0]
+;   ret
+
+function %load_uextend_scaled16(i64, i32) -> i16 {
+block0(v0: i64, v1: i32):
+    v2 = uextend.i64 v1
+    v3 = ishl_imm v2, 1
+    v4 = iadd v0, v3
+    v5 = load.i16 v4
+    return v5
+}
+
+; VCode:
+; block0:
+;   ldrh w0, [x0, w1, UXTW #1]
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   ldrh w0, [x0, w1, uxtw #1]
+;   ret
+
+function %load_uextend_scaled32(i64, i32) -> i32 {
+block0(v0: i64, v1: i32):
+    v2 = uextend.i64 v1
+    v3 = ishl_imm v2, 2
+    v4 = iadd v0, v3
+    v5 = load.i32 v4
+    return v5
+}
+
+; VCode:
+; block0:
+;   ldr w0, [x0, w1, UXTW #2]
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   ldr w0, [x0, w1, uxtw #2]
+;   ret
+
+function %load_uextend_scaled64(i64, i32) -> i64 {
+block0(v0: i64, v1: i32):
+    v2 = uextend.i64 v1
+    v3 = ishl_imm v2, 3
+    v4 = iadd v0, v3
+    v5 = load.i64 v4
+    return v5
+}
+
+; VCode:
+; block0:
+;   ldr x0, [x0, w1, UXTW #3]
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   ldr x0, [x0, w1, uxtw #3]
+;   ret
+
+function %load_not_extend_scaled64(i64, i32) -> i64 {
+block0(v0: i64, v1: i32):
+    v2 = ishl_imm v1, 3
+    v3 = uextend.i64 v2
+    v4 = iadd v0, v3
+    v5 = load.i64 v4
+    return v5
+}
+
+; VCode:
+; block0:
+;   lsl w4, w1, #3
+;   ldr x0, [x0, w4, UXTW]
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   lsl w4, w1, #3
+;   ldr x0, [x0, w4, uxtw]
+;   ret
+
+function %load_sextend_scaled8(i64, i32) -> i8 {
+block0(v0: i64, v1: i32):
+    v2 = sextend.i64 v1
+    v3 = ishl_imm v2, 0
+    v4 = iadd v0, v3
+    v5 = load.i8 v4
+    return v5
+}
+
+; VCode:
+; block0:
+;   ldrb w0, [x0, w1, SXTW #0]
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   ldrb w0, [x0, w1, sxtw #0]
+;   ret
+
+function %load_sextend_scaled16(i64, i32) -> i16 {
+block0(v0: i64, v1: i32):
+    v2 = sextend.i64 v1
+    v3 = ishl_imm v2, 1
+    v4 = iadd v0, v3
+    v5 = load.i16 v4
+    return v5
+}
+
+; VCode:
+; block0:
+;   ldrh w0, [x0, w1, SXTW #1]
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   ldrh w0, [x0, w1, sxtw #1]
+;   ret
+
+function %load_sextend_scaled32(i64, i32) -> i32 {
+block0(v0: i64, v1: i32):
+    v2 = sextend.i64 v1
+    v3 = ishl_imm v2, 2
+    v4 = iadd v0, v3
+    v5 = load.i32 v4
+    return v5
+}
+
+; VCode:
+; block0:
+;   ldr w0, [x0, w1, SXTW #2]
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   ldr w0, [x0, w1, sxtw #2]
+;   ret
+
+function %load_sextend_scaled64(i64, i32) -> i64 {
+block0(v0: i64, v1: i32):
+    v2 = sextend.i64 v1
+    v3 = ishl_imm v2, 3
+    v4 = iadd v0, v3
+    v5 = load.i64 v4
+    return v5
+}
+
+; VCode:
+; block0:
+;   ldr x0, [x0, w1, SXTW #3]
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   ldr x0, [x0, w1, sxtw #3]
+;   ret
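
One shape accepted by the new rules is not exercised by the tests above: rule 7
matches the commuted `iadd`, where the shifted index register is the first
operand rather than the second. The sketch below is a hypothetical filetest
input for that case (the function name is illustrative, and no generated VCode
is shown since none was produced here):

function %load_scaled64_commuted(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
    ; the shifted index is the *first* operand of the `iadd` (rule 7)
    v2 = ishl_imm v1, 3
    v3 = iadd v2, v0
    v4 = load.i64 v3
    return v4
}

Per rule 7 this should select the same `RegScaled` addressing mode as
%load_scaled64 above, e.g. `ldr x0, [x0, x1, LSL #3]`, rather than a separate
`lsl` followed by a register-register load.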