From 6e983a7c924c166307361968e7ea98518859dab8 Mon Sep 17 00:00:00 2001 From: Radomir Djogo <159184120+rdjogoTT@users.noreply.github.com> Date: Thu, 12 Dec 2024 00:13:56 -0500 Subject: [PATCH] Add LLK and API for binary shift left/right (#15926) ### Ticket [Link to Github Issue](https://github.com/tenstorrent/tt-metal/issues/10034) ### What's changed Implemented binary left/right shift as binary SFPU OPs. ### Checklist - [x] Post commit CI passes: https://github.com/tenstorrent/tt-metal/actions/runs/12287866881 - [x] Blackhole Post commit: https://github.com/tenstorrent/tt-metal/actions/runs/12287867702 - [x] New/Existing tests provide coverage for changes - will be added in future PR --- .../llk_api/llk_sfpu/ckernel_sfpu_shift.h | 27 ++++++++ .../llk_math_eltwise_binary_sfpu_shift.h | 34 ++++++++++ .../llk_api/llk_sfpu/ckernel_sfpu_shift.h | 27 ++++++++ .../llk_math_eltwise_binary_sfpu_shift.h | 34 ++++++++++ .../compute_kernel_api/add_int32_sfpu.h | 2 + .../compute_kernel_api/binary_bitwise_sfpu.h | 2 + .../include/compute_kernel_api/binary_shift.h | 68 +++++++++++++++++++ .../compute_kernel_api/eltwise_binary_sfpu.h | 2 + tt_metal/third_party/tt_llk_blackhole | 2 +- tt_metal/third_party/tt_llk_wormhole_b0 | 2 +- 10 files changed, 198 insertions(+), 2 deletions(-) create mode 100644 tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/ckernel_sfpu_shift.h create mode 100644 tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_shift.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_shift.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_shift.h create mode 100644 tt_metal/include/compute_kernel_api/binary_shift.h diff --git a/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/ckernel_sfpu_shift.h b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/ckernel_sfpu_shift.h new file mode 100644 index 00000000000..ccd4b2e6df2 --- /dev/null 
+++ b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/ckernel_sfpu_shift.h @@ -0,0 +1,27 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "ckernel.h" +#include "ckernel_defs.h" +#include "sfpi.h" + +using namespace sfpi; // NOTE(review): file-scope using-directive in a header; every includer inherits it. + +namespace ckernel { +namespace sfpu { + +template // NOTE(review): no template parameter list is visible on this line; verify against the upstream header. +inline void calculate_binary_left_shift(const uint dst_offset) { // Wrapper: forwards dst_offset unchanged to _calculate_binary_left_shift_. + _calculate_binary_left_shift_(dst_offset); // Callee is not defined in this header; presumably provided by the LLK library - TODO confirm. +} + +template // NOTE(review): same missing parameter list as on the left-shift wrapper. +inline void calculate_binary_right_shift(const uint dst_offset) { // Wrapper: forwards dst_offset unchanged to _calculate_binary_right_shift_. + _calculate_binary_right_shift_(dst_offset); // Callee is not defined in this header; presumably provided by the LLK library - TODO confirm. +} + +} // namespace sfpu +} // namespace ckernel diff --git a/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_shift.h b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_shift.h new file mode 100644 index 00000000000..337fdd9df5c --- /dev/null +++ b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_shift.h @@ -0,0 +1,34 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "llk_math_eltwise_binary_sfpu_init.h" +#include "llk_math_eltwise_binary_sfpu_params.h" +#include "ckernel_sfpu_shift.h" + +namespace ckernel { + +// LLK API for the binary SFPU shift ops: one init function plus a left-shift and a right-shift entry point. + +template // NOTE(review): no template parameter list is visible on this line; verify against the upstream header. +inline void llk_math_eltwise_binary_sfpu_shift_init() { // Delegates entirely to llk_math_eltwise_binary_sfpu_init(). + llk_math_eltwise_binary_sfpu_init(); +} + +template +inline void llk_math_eltwise_binary_sfpu_left_shift( + uint dst_index0, uint32_t dst_index1, int vector_mode = VectorMode::RC) { // NOTE(review): dst_index0 is uint while dst_index1 is uint32_t; confirm the mix is intended. + llk_math_eltwise_binary_sfpu_params( // Passes the left-shift kernel, both indices and vector_mode to the params helper. + ckernel::sfpu::calculate_binary_left_shift, dst_index0, dst_index1, vector_mode); +} + +template +inline void llk_math_eltwise_binary_sfpu_right_shift( + uint dst_index0, uint32_t dst_index1, int vector_mode = VectorMode::RC) { // NOTE(review): dst_index0 is uint while dst_index1 is uint32_t; confirm the mix is intended. + llk_math_eltwise_binary_sfpu_params( // Passes the right-shift kernel, both indices and vector_mode to the params helper. + ckernel::sfpu::calculate_binary_right_shift, dst_index0, dst_index1, vector_mode); +} + +} // namespace ckernel diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_shift.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_shift.h new file mode 100644 index 00000000000..ccd4b2e6df2 --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_shift.h @@ -0,0 +1,27 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "ckernel.h" +#include "ckernel_defs.h" +#include "sfpi.h" + +using namespace sfpi; // NOTE(review): file-scope using-directive in a header; every includer inherits it. + +namespace ckernel { +namespace sfpu { + +template // NOTE(review): no template parameter list is visible on this line; verify against the upstream header. +inline void calculate_binary_left_shift(const uint dst_offset) { // Wrapper: forwards dst_offset unchanged to _calculate_binary_left_shift_. + _calculate_binary_left_shift_(dst_offset); // Callee is not defined in this header; presumably provided by the LLK library - TODO confirm. +} + +template // NOTE(review): same missing parameter list as on the left-shift wrapper. +inline void calculate_binary_right_shift(const uint dst_offset) { // Wrapper: forwards dst_offset unchanged to _calculate_binary_right_shift_. + _calculate_binary_right_shift_(dst_offset); // Callee is not defined in this header; presumably provided by the LLK library - TODO confirm. +} + +} // namespace sfpu +} // namespace ckernel diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_shift.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_shift.h new file mode 100644 index 00000000000..337fdd9df5c --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_shift.h @@ -0,0 +1,34 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "llk_math_eltwise_binary_sfpu_init.h" +#include "llk_math_eltwise_binary_sfpu_params.h" +#include "ckernel_sfpu_shift.h" + +namespace ckernel { + +// LLK API for the binary SFPU shift ops: one init function plus a left-shift and a right-shift entry point. + +template // NOTE(review): no template parameter list is visible on this line; verify against the upstream header. +inline void llk_math_eltwise_binary_sfpu_shift_init() { // Delegates entirely to llk_math_eltwise_binary_sfpu_init(). + llk_math_eltwise_binary_sfpu_init(); +} + +template +inline void llk_math_eltwise_binary_sfpu_left_shift( + uint dst_index0, uint32_t dst_index1, int vector_mode = VectorMode::RC) { // NOTE(review): dst_index0 is uint while dst_index1 is uint32_t; confirm the mix is intended. + llk_math_eltwise_binary_sfpu_params( // Passes the left-shift kernel, both indices and vector_mode to the params helper. + ckernel::sfpu::calculate_binary_left_shift, dst_index0, dst_index1, vector_mode); +} + +template +inline void llk_math_eltwise_binary_sfpu_right_shift( + uint dst_index0, uint32_t dst_index1, int vector_mode = VectorMode::RC) { // NOTE(review): dst_index0 is uint while dst_index1 is uint32_t; confirm the mix is intended. + llk_math_eltwise_binary_sfpu_params( // Passes the right-shift kernel, both indices and vector_mode to the params helper. + ckernel::sfpu::calculate_binary_right_shift, dst_index0, dst_index1, vector_mode); +} + +} // namespace ckernel diff --git a/tt_metal/include/compute_kernel_api/add_int32_sfpu.h b/tt_metal/include/compute_kernel_api/add_int32_sfpu.h index 4de5ee5b55a..f566c7e34da 
100644 --- a/tt_metal/include/compute_kernel_api/add_int32_sfpu.h +++ b/tt_metal/include/compute_kernel_api/add_int32_sfpu.h @@ -19,6 +19,8 @@ namespace ckernel { * Performs an elementwise add operation with the two integer inputs: y = add(x0,x1) * Output overwrites first operand in DST. * + * The DST register buffer must be in acquired state via *acquire_dst* call. This call is blocking and is only available + * on the compute engine. * A maximum of 4 tiles from each operand can be loaded into DST at once, for a total of 8 tiles, * when using 16 bit formats. This gets reduced to 2 tiles from each operand for 32 bit formats. * diff --git a/tt_metal/include/compute_kernel_api/binary_bitwise_sfpu.h b/tt_metal/include/compute_kernel_api/binary_bitwise_sfpu.h index cf2a20d0090..1ec6d40ceca 100644 --- a/tt_metal/include/compute_kernel_api/binary_bitwise_sfpu.h +++ b/tt_metal/include/compute_kernel_api/binary_bitwise_sfpu.h @@ -19,6 +19,8 @@ namespace ckernel { * Performs an elementwise binary bitwise operation with the two inputs: y = bitwise(x0,x1) * Output overwrites first operand in DST. * + * The DST register buffer must be in acquired state via *acquire_dst* call. This call is blocking and is only available + * on the compute engine. * A maximum of 4 tiles from each operand can be loaded into DST at once, for a total of 8 tiles, * when using 16 bit formats. This gets reduced to 2 tiles from each operand for 32 bit formats. * diff --git a/tt_metal/include/compute_kernel_api/binary_shift.h b/tt_metal/include/compute_kernel_api/binary_shift.h new file mode 100644 index 00000000000..3bd2ddb9a59 --- /dev/null +++ b/tt_metal/include/compute_kernel_api/binary_shift.h @@ -0,0 +1,68 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "compute_kernel_api/common_globals.h" +#ifdef TRISC_MATH +#include "llk_math_eltwise_binary_sfpu_shift.h" +#define MAIN math_main() +#define MATH(x) x // With TRISC_MATH defined the wrapped expression is emitted as-is. +#else +#define MATH(x) // Without TRISC_MATH the wrapped expression is discarded. +#endif + +namespace ckernel { + +/** + * Performs an elementwise shift operation to the left on the input at idst0, by input at idst1: y = x0 << x1 + * Both inputs must be of Int32 data type only. Output overwrites first operand in DST. + * + * The DST register buffer must be in acquired state via *acquire_dst* call. This call is blocking and is only available + * on the compute engine. + * A maximum of 4 tiles from each operand can be loaded into DST at once, for a total of 8 tiles, + * when using 16 bit formats. This gets reduced to 2 tiles from each operand for 32 bit formats. + * + * Return value: None + * + * | Argument | Description | Type | Valid Range | Required | + * |----------------|-----------------------------------------------------------------------|----------|-------------------------------------------------------|----------| + * | idst0 | The index of the tile in DST register buffer to use as first operand | uint32_t | Must be less than the size of the DST register buffer | True | + * | idst1 | The index of the tile in DST register buffer to use as second operand | uint32_t | Must be less than the size of the DST register buffer | True | + * + * Note: the call below is wrapped in MATH(...), which this header defines as empty unless TRISC_MATH is set. + */ +ALWI void binary_left_shift_tile(uint32_t idst0, uint32_t idst1) { + MATH((llk_math_eltwise_binary_sfpu_left_shift(idst0, idst1))); +} + +/** + * Performs an elementwise shift operation to the right on the input at idst0, by input at idst1: y = x0 >> x1 + * Both inputs must be of Int32 data type only. Output overwrites first operand in DST. + * + * The DST register buffer must be in acquired state via *acquire_dst* call. This call is blocking and is only available + * on the compute engine. 
+ * A maximum of 4 tiles from each operand can be loaded into DST at once, for a total of 8 tiles, + * when using 16 bit formats. This gets reduced to 2 tiles from each operand for 32 bit formats. + * + * Return value: None + * + * | Argument | Description | Type | Valid Range | Required | + * |----------------|-----------------------------------------------------------------------|----------|-------------------------------------------------------|----------| + * | idst0 | The index of the tile in DST register buffer to use as first operand | uint32_t | Must be less than the size of the DST register buffer | True | + * | idst1 | The index of the tile in DST register buffer to use as second operand | uint32_t | Must be less than the size of the DST register buffer | True | + * + * Note: the call below is wrapped in MATH(...), which this header defines as empty unless TRISC_MATH is set. + */ + +ALWI void binary_right_shift_tile(uint32_t idst0, uint32_t idst1) { + MATH((llk_math_eltwise_binary_sfpu_right_shift(idst0, idst1))); +} + +/** + * Forwards to llk_math_eltwise_binary_sfpu_shift_init() when TRISC_MATH is defined. Please refer to documentation for any_init. + */ +ALWI void binary_shift_tile_init() { MATH((llk_math_eltwise_binary_sfpu_shift_init())); } + +} // namespace ckernel diff --git a/tt_metal/include/compute_kernel_api/eltwise_binary_sfpu.h b/tt_metal/include/compute_kernel_api/eltwise_binary_sfpu.h index 22fc4c13fcf..23995891940 100644 --- a/tt_metal/include/compute_kernel_api/eltwise_binary_sfpu.h +++ b/tt_metal/include/compute_kernel_api/eltwise_binary_sfpu.h @@ -19,6 +19,8 @@ namespace ckernel { * Performs an elementwise binop operation with the two floating point inputs: y = binop(x0,x1) * Output overwrites first operand in DST. * + * The DST register buffer must be in acquired state via *acquire_dst* call. This call is blocking and is only available + * on the compute engine. * A maximum of 4 tiles from each operand can be loaded into DST at once, for a total of 8 tiles, * when using 16 bit formats. This gets reduced to 2 tiles from each operand for 32 bit formats. 
* diff --git a/tt_metal/third_party/tt_llk_blackhole b/tt_metal/third_party/tt_llk_blackhole index 7536fbacd75..973288fb014 160000 --- a/tt_metal/third_party/tt_llk_blackhole +++ b/tt_metal/third_party/tt_llk_blackhole @@ -1 +1 @@ -Subproject commit 7536fbacd75a4ad62047c63c9c54176fae079e06 +Subproject commit 973288fb014a22ce72cdba1c38a9f41f48532d6d diff --git a/tt_metal/third_party/tt_llk_wormhole_b0 b/tt_metal/third_party/tt_llk_wormhole_b0 index 0f57d4e9dec..33a7f6a0267 160000 --- a/tt_metal/third_party/tt_llk_wormhole_b0 +++ b/tt_metal/third_party/tt_llk_wormhole_b0 @@ -1 +1 @@ -Subproject commit 0f57d4e9dec602b68671be8891e7af876285f275 +Subproject commit 33a7f6a026719af509a119d8a4e8e36c7c31854c