Skip to content

Commit

Permalink
QS8 Neon microkernels switch from x9 to x11 for params
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 382451240
  • Loading branch information
fbarchard authored and xnnpack-bot committed Jul 1, 2021
1 parent 898d585 commit 28138f1
Show file tree
Hide file tree
Showing 16 changed files with 118 additions and 118 deletions.
16 changes: 8 additions & 8 deletions src/qs8-gemm/1x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
# int8_t* restrict c, x6
# size_t cm_stride, (x7)
# size_t cn_stride, [sp] -> x10
# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x9
# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

Expand All @@ -29,7 +29,7 @@

BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}_cortex_a53

LDP x10, x9, [sp] // cn_stride, params
LDP x10, x11, [sp] // cn_stride, params

ADD x2, x2, 7 // kc = (kc + 7) & ~7
BIC x2, x2, 7
Expand Down Expand Up @@ -192,18 +192,18 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_pad
# Add columns
ADDP v16.4s, v16.4s, v18.4s
ADDP v20.4s, v20.4s, v22.4s
LD1R {v4.4s}, [x9], 4
LD1R {v4.4s}, [x11], 4
ADDP v24.4s, v24.4s, v26.4s
ADDP v28.4s, v28.4s, v30.4s
LD1R {v7.4s}, [x9], 4
LD1R {v7.4s}, [x11], 4
ADDP v0.4s, v16.4s, v20.4s
ADDP v1.4s, v24.4s, v28.4s

# Apply params - scale, shift, bias and clamp
SQRDMULH v0.4s, v0.4s, v4.4s
SQRDMULH v1.4s, v1.4s, v4.4s
CMEQ v4.4s, v7.4s, 0
LD1R {v5.8h}, [x9], 2
LD1R {v5.8h}, [x11], 2
BIC v6.16b, v0.16b, v4.16b
BIC v16.16b, v1.16b, v4.16b
SSRA v0.4s, v6.4s, 31
Expand All @@ -215,10 +215,10 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_pad
SUBS x1, x1, 8
SQADD v0.8h, v0.8h, v5.8h
SQXTN v0.8b, v0.8h
LD1R {v1.16b}, [x9], 1
LD1R {v17.16b}, [x9]
LD1R {v1.16b}, [x11], 1
LD1R {v17.16b}, [x11]
SMAX v0.8b, v0.8b, v1.8b
SUB x9, x9, 11 // rewind params pointer
SUB x11, x11, 11 // rewind params pointer
SMIN v0.8b, v0.8b, v17.8b
B.LO 5f

Expand Down
16 changes: 8 additions & 8 deletions src/qs8-gemm/1x8c8-aarch64-neon-mlal-padal.S.in
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
# int8_t* restrict c, x6
# size_t cm_stride, (x7)
# size_t cn_stride, [sp] -> x10
# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x9
# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

Expand All @@ -28,7 +28,7 @@

BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}

LDP x10, x9, [sp] // cn_stride, params
LDP x10, x11, [sp] // cn_stride, params

ADD x2, x2, 7 // kc = (kc + 7) & ~7
BIC x2, x2, 7
Expand Down Expand Up @@ -167,18 +167,18 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_pad
# Add columns
ADDP v16.4s, v16.4s, v18.4s
ADDP v20.4s, v20.4s, v22.4s
LD1R {v4.4s}, [x9], 4
LD1R {v4.4s}, [x11], 4
ADDP v24.4s, v24.4s, v26.4s
ADDP v28.4s, v28.4s, v30.4s
LD1R {v7.4s}, [x9], 4
LD1R {v7.4s}, [x11], 4
ADDP v0.4s, v16.4s, v20.4s
ADDP v1.4s, v24.4s, v28.4s

# Apply params - scale, shift, bias and clamp
SQRDMULH v0.4s, v0.4s, v4.4s
SQRDMULH v1.4s, v1.4s, v4.4s
CMEQ v4.4s, v7.4s, 0
LD1R {v5.8h}, [x9], 2
LD1R {v5.8h}, [x11], 2
BIC v6.16b, v0.16b, v4.16b
BIC v16.16b, v1.16b, v4.16b
SSRA v0.4s, v6.4s, 31
Expand All @@ -190,10 +190,10 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_pad
SUBS x1, x1, 8
SQADD v0.8h, v0.8h, v5.8h
SQXTN v0.8b, v0.8h
LD1R {v1.16b}, [x9], 1
LD1R {v17.16b}, [x9]
LD1R {v1.16b}, [x11], 1
LD1R {v17.16b}, [x11]
SMAX v0.8b, v0.8b, v1.8b
SUB x9, x9, 11 // rewind params pointer
SUB x11, x11, 11 // rewind params pointer
SMIN v0.8b, v0.8b, v17.8b
B.LO 5f

Expand Down
14 changes: 7 additions & 7 deletions src/qs8-gemm/2x8c16-aarch64-neon-mlal-padal.S.in
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
# int8_t* restrict c, x6
# size_t cm_stride, x7
# size_t cn_stride, [sp] -> x10
# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x9
# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

Expand Down Expand Up @@ -58,7 +58,7 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_pa
MOV v27.16b, v26.16b
LDP s28, s30, [x5], 8
MOV v29.16b, v28.16b
LDP x10, x9, [sp, 48] // cn_stride, params
LDP x10, x11, [sp, 48] // cn_stride, params
MOV v31.16b, v30.16b

# Main loop - 16 bytes of A
Expand Down Expand Up @@ -126,10 +126,10 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_pa
# Add columns
ADDP v16.4s, v16.4s, v18.4s
ADDP v20.4s, v20.4s, v22.4s
LD1R {v4.4s}, [x9], 4
LD1R {v4.4s}, [x11], 4
ADDP v24.4s, v24.4s, v26.4s
ADDP v28.4s, v28.4s, v30.4s
LD1R {v7.4s}, [x9], 4
LD1R {v7.4s}, [x11], 4
ADDP v17.4s, v17.4s, v19.4s
ADDP v21.4s, v21.4s, v23.4s
ADDP v25.4s, v25.4s, v27.4s
Expand All @@ -145,7 +145,7 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_pa
SQRDMULH v2.4s, v2.4s, v4.4s
SQRDMULH v3.4s, v3.4s, v4.4s
CMEQ v4.4s, v7.4s, 0
LD1R {v5.8h}, [x9], 2
LD1R {v5.8h}, [x11], 2
BIC v6.16b, v0.16b, v4.16b
BIC v16.16b, v1.16b, v4.16b
BIC v17.16b, v2.16b, v4.16b
Expand All @@ -167,8 +167,8 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_pa
SQADD v1.8h, v2.8h, v5.8h
SQXTN v0.8b, v0.8h
SQXTN2 v0.16b, v1.8h
LD1R {v1.16b}, [x9], 1
LD1R {v2.16b}, [x9]
LD1R {v1.16b}, [x11], 1
LD1R {v2.16b}, [x11]
SMAX v0.16b, v0.16b, v1.16b
SMIN v0.16b, v0.16b, v2.16b
B.LO 2f
Expand Down
14 changes: 7 additions & 7 deletions src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
# int8_t* restrict c, x6
# size_t cm_stride, x7
# size_t cn_stride, [sp] -> x10
# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x9
# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

Expand Down Expand Up @@ -61,7 +61,7 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_pad
MOV v27.16b, v26.16b
LDP s28, s30, [x5], 8
MOV v29.16b, v28.16b
LDP x10, x9, [sp, 80] // cn_stride, params
LDP x10, x11, [sp, 80] // cn_stride, params
MOV v31.16b, v30.16b
# Is there at least 16 bytes for epilogue?
B.LO 4f
Expand Down Expand Up @@ -269,10 +269,10 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_pad
# Add columns
ADDP v16.4s, v16.4s, v18.4s
ADDP v20.4s, v20.4s, v22.4s
LD1R {v4.4s}, [x9], 4
LD1R {v4.4s}, [x11], 4
ADDP v24.4s, v24.4s, v26.4s
ADDP v28.4s, v28.4s, v30.4s
LD1R {v7.4s}, [x9], 4
LD1R {v7.4s}, [x11], 4
ADDP v17.4s, v17.4s, v19.4s
ADDP v21.4s, v21.4s, v23.4s
ADDP v25.4s, v25.4s, v27.4s
Expand All @@ -288,7 +288,7 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_pad
SQRDMULH v2.4s, v2.4s, v4.4s
SQRDMULH v3.4s, v3.4s, v4.4s
CMEQ v4.4s, v7.4s, 0
LD1R {v5.8h}, [x9], 2
LD1R {v5.8h}, [x11], 2
BIC v6.16b, v0.16b, v4.16b
BIC v16.16b, v1.16b, v4.16b
BIC v17.16b, v2.16b, v4.16b
Expand All @@ -310,8 +310,8 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_pad
SQADD v1.8h, v2.8h, v5.8h
SQXTN v0.8b, v0.8h
SQXTN2 v0.16b, v1.8h
LD1R {v1.16b}, [x9], 1
LD1R {v2.16b}, [x9]
LD1R {v1.16b}, [x11], 1
LD1R {v2.16b}, [x11]
SMAX v0.16b, v0.16b, v1.16b
SMIN v0.16b, v0.16b, v2.16b
B.LO 5f
Expand Down
14 changes: 7 additions & 7 deletions src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S.in
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
# int8_t* restrict c, x6
# size_t cm_stride, x7
# size_t cn_stride, [sp] -> x10
# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x9
# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

Expand Down Expand Up @@ -59,7 +59,7 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_pad
MOV v27.16b, v26.16b
LDP s28, s30, [x5], 8
MOV v29.16b, v28.16b
LDP x10, x9, [sp, 64] // cn_stride, params
LDP x10, x11, [sp, 64] // cn_stride, params
MOV v31.16b, v30.16b
# Is there at least 16 bytes for epilogue?
B.LO 4f
Expand Down Expand Up @@ -223,10 +223,10 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_pad
# Add columns
ADDP v16.4s, v16.4s, v18.4s
ADDP v20.4s, v20.4s, v22.4s
LD1R {v4.4s}, [x9], 4
LD1R {v4.4s}, [x11], 4
ADDP v24.4s, v24.4s, v26.4s
ADDP v28.4s, v28.4s, v30.4s
LD1R {v7.4s}, [x9], 4
LD1R {v7.4s}, [x11], 4
ADDP v17.4s, v17.4s, v19.4s
ADDP v21.4s, v21.4s, v23.4s
ADDP v25.4s, v25.4s, v27.4s
Expand All @@ -242,7 +242,7 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_pad
SQRDMULH v2.4s, v2.4s, v4.4s
SQRDMULH v3.4s, v3.4s, v4.4s
CMEQ v4.4s, v7.4s, 0
LD1R {v5.8h}, [x9], 2
LD1R {v5.8h}, [x11], 2
BIC v6.16b, v0.16b, v4.16b
BIC v16.16b, v1.16b, v4.16b
BIC v17.16b, v2.16b, v4.16b
Expand All @@ -264,8 +264,8 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mlal_pad
SQADD v1.8h, v2.8h, v5.8h
SQXTN v0.8b, v0.8h
SQXTN2 v0.16b, v1.8h
LD1R {v1.16b}, [x9], 1
LD1R {v2.16b}, [x9]
LD1R {v1.16b}, [x11], 1
LD1R {v2.16b}, [x11]
SMAX v0.16b, v0.16b, v1.16b
SMIN v0.16b, v0.16b, v2.16b
B.LO 5f
Expand Down
14 changes: 7 additions & 7 deletions src/qs8-gemm/2x8c8-aarch64-neon-mull-padal.S.in
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
# int8_t* restrict c, x6
# size_t cm_stride, x7
# size_t cn_stride, [sp] -> x10
# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x9
# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

Expand Down Expand Up @@ -58,7 +58,7 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_pad
MOV v27.16b, v26.16b
LDP s28, s30, [x5], 8
MOV v29.16b, v28.16b
LDP x10, x9, [sp, 48] // cn_stride, params
LDP x10, x11, [sp, 48] // cn_stride, params
MOV v31.16b, v30.16b

# Main loop - 8 bytes of A
Expand Down Expand Up @@ -109,10 +109,10 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_pad
# Add columns
ADDP v16.4s, v16.4s, v18.4s
ADDP v20.4s, v20.4s, v22.4s
LD1R {v4.4s}, [x9], 4
LD1R {v4.4s}, [x11], 4
ADDP v24.4s, v24.4s, v26.4s
ADDP v28.4s, v28.4s, v30.4s
LD1R {v7.4s}, [x9], 4
LD1R {v7.4s}, [x11], 4
ADDP v17.4s, v17.4s, v19.4s
ADDP v21.4s, v21.4s, v23.4s
ADDP v25.4s, v25.4s, v27.4s
Expand All @@ -128,7 +128,7 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_pad
SQRDMULH v2.4s, v2.4s, v4.4s
SQRDMULH v3.4s, v3.4s, v4.4s
CMEQ v4.4s, v7.4s, 0
LD1R {v5.8h}, [x9], 2
LD1R {v5.8h}, [x11], 2
BIC v6.16b, v0.16b, v4.16b
BIC v16.16b, v1.16b, v4.16b
BIC v17.16b, v2.16b, v4.16b
Expand All @@ -150,8 +150,8 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__aarch64_neon_mull_pad
SQADD v1.8h, v2.8h, v5.8h
SQXTN v0.8b, v0.8h
SQXTN2 v0.16b, v1.8h
LD1R {v1.16b}, [x9], 1
LD1R {v2.16b}, [x9]
LD1R {v1.16b}, [x11], 1
LD1R {v2.16b}, [x11]
SMAX v0.16b, v0.16b, v1.16b
SMIN v0.16b, v0.16b, v2.16b
B.LO 2f
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
# int8_t* restrict c, x6
# size_t cm_stride, (x7)
# size_t cn_stride, [sp] -> x10
# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x9
# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

Expand All @@ -33,7 +33,7 @@

BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53

LDP x10, x9, [sp] // cn_stride, params
LDP x10, x11, [sp] // cn_stride, params

ADD x2, x2, 7 // kc = (kc + 7) & ~7
BIC x2, x2, 7
Expand Down Expand Up @@ -190,18 +190,18 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_pad
# Add columns
ADDP v16.4s, v16.4s, v18.4s
ADDP v20.4s, v20.4s, v22.4s
LD1R {v4.4s}, [x9], 4
LD1R {v4.4s}, [x11], 4
ADDP v24.4s, v24.4s, v26.4s
ADDP v28.4s, v28.4s, v30.4s
LD1R {v7.4s}, [x9], 4
LD1R {v7.4s}, [x11], 4
ADDP v0.4s, v16.4s, v20.4s
ADDP v1.4s, v24.4s, v28.4s

# Apply params - scale, shift, bias and clamp
SQRDMULH v0.4s, v0.4s, v4.4s
SQRDMULH v1.4s, v1.4s, v4.4s
CMEQ v4.4s, v7.4s, 0
LD1R {v5.8h}, [x9], 2
LD1R {v5.8h}, [x11], 2
BIC v6.16b, v0.16b, v4.16b
BIC v16.16b, v1.16b, v4.16b
SSRA v0.4s, v6.4s, 31
Expand All @@ -213,10 +213,10 @@ BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_pad
SUBS x1, x1, 8
SQADD v0.8h, v0.8h, v5.8h
SQXTN v0.8b, v0.8h
LD1R {v1.16b}, [x9], 1
LD1R {v17.16b}, [x9]
LD1R {v1.16b}, [x11], 1
LD1R {v17.16b}, [x11]
SMAX v0.8b, v0.8b, v1.8b
SUB x9, x9, 11 // rewind params pointer
SUB x11, x11, 11 // rewind params pointer
SMIN v0.8b, v0.8b, v17.8b
B.LO 5f

Expand Down
Loading

0 comments on commit 28138f1

Please sign in to comment.