Skip to content

Commit

Permalink
tenstorrent#13644: Add support for tensor-scalar binary ops
Browse files Browse the repository at this point in the history
  • Loading branch information
yan-zaretskiy authored and Christopher Taylor committed Nov 9, 2024
1 parent 6530072 commit 9b443e6
Show file tree
Hide file tree
Showing 12 changed files with 379 additions and 238 deletions.
2 changes: 1 addition & 1 deletion .clang-tidy
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ Checks: >
performance-*,
modernize-*,
readability-*,
cppcoreguidelines-*
cppcoreguidelines-*,
-modernize-use-trailing-return-type
CheckOptions:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <stdint.h>
#include "dataflow_api.h"
#include "ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/dataflow/generate_bcast_scalar.hpp"



void kernel_main() {
    // Reader kernel for tensor-scalar binary ops: streams tiles of input 0
    // into CB 0 (skipped when the input is already sharded into L1) and
    // materializes the packed scalar operand as a broadcast tile in CB 1.
    //
    // Runtime args:
    //   0: src0_addr          — base address of input 0
    //   1: packed_scalar      — scalar operand packed into 32 bits
    //                           (format defined by generate_bcast_unary_scalar)
    //   2: num_tiles          — number of tiles this core reads
    //   3: HtWt               — tiles per batch (presumably Ht * Wt — confirm against host)
    //   4: base_start_id_HtWt — tile id at the start of the current batch
    //   5: curr_id_from_base  — offset of the next tile within the batch
    //   6: bcast_id           — reserved in the arg layout; not used by this kernel
    const auto src0_addr = get_arg_val<uint32_t>(0);
    const auto packed_scalar = get_arg_val<uint32_t>(1);
    const auto num_tiles = get_arg_val<uint32_t>(2);
    const auto HtWt = get_arg_val<uint32_t>(3);
    auto base_start_id_HtWt = get_arg_val<uint32_t>(4);
    auto curr_id_from_base = get_arg_val<uint32_t>(5);
    [[maybe_unused]] const auto bcast_id = get_arg_val<uint32_t>(6);

    constexpr uint32_t cb_id_in0 = 0;
    constexpr uint32_t cb_id_in1 = 1;
    constexpr uint32_t onetile = 1;

#ifndef IN0_SHARDED
    constexpr bool src0_is_dram = get_compile_time_arg_val(0) == 1;

    // Single-tile ublocks: address generator for reading input 0 tile by tile.
    const uint32_t in0_tile_bytes = get_tile_size(cb_id_in0);
    const DataFormat in0_data_format = get_dataformat(cb_id_in0);
    const InterleavedAddrGenFast<src0_is_dram> s0 = {
        .bank_base_address = src0_addr,
        .page_size = in0_tile_bytes,
        .data_format = in0_data_format};
#else
    // Sharded input already resides in the CB's L1 region; just publish
    // all tiles at once so the compute kernel can consume them.
    cb_reserve_back(cb_id_in0, num_tiles);
    cb_push_back(cb_id_in0, num_tiles);
#endif

    // Write the scalar tile into CB 1 once; the compute kernel broadcasts
    // it against every input tile.
    generate_bcast_unary_scalar(cb_id_in1, packed_scalar);

    for (uint32_t i = 0; i < num_tiles; i++) {
        const uint32_t curr_id = base_start_id_HtWt + curr_id_from_base;

#ifndef IN0_SHARDED
        cb_reserve_back(cb_id_in0, onetile);
        const uint32_t l1_write_addr_in0 = get_write_ptr(cb_id_in0);
        noc_async_read_tile(curr_id, s0, l1_write_addr_in0);
        // The tile must have fully landed in L1 before it is published.
        noc_async_read_barrier();
        cb_push_back(cb_id_in0, onetile);
#endif

        curr_id_from_base++;
        // Wrap to the start of the next HtWt-sized batch.
        if (curr_id_from_base == HtWt) {
            base_start_id_HtWt += HtWt;
            curr_id_from_base = 0;
        }
    }
}
98 changes: 51 additions & 47 deletions ttnn/cpp/ttnn/operations/eltwise/binary/binary.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,11 +98,8 @@ inline Tensor binary_impl(
return output_tensor;
}

template<BinaryOpType binary_op_type>
auto preprocess_inputs(
const Tensor& input_tensor_a_arg,
const Tensor& input_tensor_b_arg) {

template <BinaryOpType binary_op_type>
auto preprocess_inputs(const Tensor &input_tensor_a_arg, const Tensor &input_tensor_b_arg) {
Tensor input_tensor_a = input_tensor_a_arg;
Tensor input_tensor_b = input_tensor_b_arg;

Expand Down Expand Up @@ -149,8 +146,8 @@ Tensor BinaryOperation<binary_op_type>::invoke(
std::optional<Tensor> optional_output_tensor,
std::optional<unary::FusedActivations> activations,
std::optional<unary::UnaryWithParam> input_tensor_a_activation) {

auto [input_tensor_a, input_tensor_b] = detail::preprocess_inputs<binary_op_type>(input_tensor_a_arg, input_tensor_b_arg);
auto [input_tensor_a, input_tensor_b] =
detail::preprocess_inputs<binary_op_type>(input_tensor_a_arg, input_tensor_b_arg);

return ttnn::prim::binary(
queue_id,
Expand Down Expand Up @@ -184,53 +181,44 @@ Tensor BinaryOperation<binary_op_type>::invoke(
input_tensor_a_activation);
}

// TODO: this case should use BinaryWithScalarProgramConfig and there should be a custom kernel to run this
// Currently, this is exactly how tt::tt_metal::add_unary works
template <BinaryOpType binary_op_type>
Tensor BinaryOperation<binary_op_type>::invoke(
uint8_t queue_id,
const ttnn::Tensor &input_tensor_a,
const float scalar,
const std::optional<const DataType> &dtype,
float scalar,
const std::optional<const DataType> &output_dtype,
const std::optional<ttnn::MemoryConfig> &memory_config,
const std::optional<Tensor> &optional_output_tensor,
std::optional<unary::FusedActivations> activations,
std::optional<unary::UnaryWithParam> input_tensor_a_activation) {
return BinaryOperation::invoke(
DefaultQueueId,
return ttnn::prim::binary(
queue_id,
input_tensor_a,
scalar,
dtype,
binary_op_type,
output_dtype,
memory_config,
optional_output_tensor,
activations,
input_tensor_a_activation);
}

// TODO: this case should use BinaryWithScalarProgramConfig and there should be a custom kernel to run this
// Currently, this is exactly how tt::tt_metal::add_unary works
template <BinaryOpType binary_op_type>
Tensor BinaryOperation<binary_op_type>::invoke(
uint8_t queue_id,
const ttnn::Tensor &input_tensor_a,
const float scalar,
const std::optional<const DataType> &dtype,
float scalar,
const std::optional<const DataType> &output_dtype,
const std::optional<ttnn::MemoryConfig> &memory_config,
const std::optional<Tensor> &optional_output_tensor,
std::optional<unary::FusedActivations> activations,
std::optional<unary::UnaryWithParam> input_tensor_a_activation) {
using namespace tt::constants;
// Cast Float Scalar to a device tensor
auto host_buffer = owned_buffer::create<::bfloat16>(static_cast<std::size_t>(TILE_HEIGHT * TILE_WIDTH));
host_buffer[0] = scalar;
Tensor scalar_tensor_host = Tensor(
OwnedStorage{host_buffer},
ttnn::Shape(std::array<std::uint32_t, 2>{1, 1}, std::array<std::uint32_t, 2>{TILE_HEIGHT, TILE_WIDTH}),
DataType::BFLOAT16,
Layout::TILE);
Tensor scalar_tensor_device = scalar_tensor_host.to(input_tensor_a.device());
// TODO(arakhmati): #7637 pass in memory_config instead of operation::DEFAULT_OUTPUT_MEMORY_CONFIG
return BinaryOperation::invoke(
DefaultQueueId,
input_tensor_a,
scalar_tensor_device,
dtype,
scalar,
output_dtype,
memory_config,
optional_output_tensor,
activations,
Expand All @@ -253,7 +241,8 @@ Tensor RelationalBinary<binary_op_type>::invoke(
"If both output dtype and output tensor provided dtype should match");
}

auto [input_tensor_a, input_tensor_b] = detail::preprocess_inputs<binary_op_type>(input_tensor_a_arg, input_tensor_b_arg);
auto [input_tensor_a, input_tensor_b] =
detail::preprocess_inputs<binary_op_type>(input_tensor_a_arg, input_tensor_b_arg);

auto output_memory_config = memory_config.value_or(input_tensor_a.memory_config());
DataType dtype = output_dtype.value_or(input_tensor_a.get_dtype());
Expand Down Expand Up @@ -334,25 +323,34 @@ Tensor RelationalBinary<binary_op_type>::invoke(

template <BinaryOpType binary_op_type>
Tensor InplaceRelationalBinary<binary_op_type>::invoke(
const Tensor &input_tensor_a_arg,
const Tensor &input_tensor_b_arg) {

return RelationalBinary<binary_op_type>::invoke(input_tensor_a_arg, input_tensor_b_arg, std::nullopt, std::nullopt, input_tensor_a_arg, std::nullopt, std::nullopt);
const Tensor &input_tensor_a_arg, const Tensor &input_tensor_b_arg) {
return RelationalBinary<binary_op_type>::invoke(
input_tensor_a_arg,
input_tensor_b_arg,
std::nullopt,
std::nullopt,
input_tensor_a_arg,
std::nullopt,
std::nullopt);
}

template <BinaryOpType binary_op_type>
Tensor InplaceRelationalBinary<binary_op_type>::invoke(
const ttnn::Tensor &input_tensor_a,
const float scalar) {
return RelationalBinary<binary_op_type>::invoke(input_tensor_a, scalar, std::nullopt, std::nullopt, input_tensor_a, std::nullopt, std::nullopt);
Tensor InplaceRelationalBinary<binary_op_type>::invoke(const ttnn::Tensor &input_tensor_a, const float scalar) {
return RelationalBinary<binary_op_type>::invoke(
input_tensor_a, scalar, std::nullopt, std::nullopt, input_tensor_a, std::nullopt, std::nullopt);
}

template <BinaryOpType binary_op_type>
Tensor InplaceLogicalBinary<binary_op_type>::invoke(
const Tensor &input_tensor_a_arg,
const Tensor &input_tensor_b_arg) {

return BinaryOperation<binary_op_type>::invoke(input_tensor_a_arg, input_tensor_b_arg, std::nullopt, std::nullopt, input_tensor_a_arg, std::nullopt, std::nullopt);
const Tensor &input_tensor_a_arg, const Tensor &input_tensor_b_arg) {
return BinaryOperation<binary_op_type>::invoke(
input_tensor_a_arg,
input_tensor_b_arg,
std::nullopt,
std::nullopt,
input_tensor_a_arg,
std::nullopt,
std::nullopt);
}

template <BinaryOpType binary_op_type>
Expand All @@ -361,8 +359,14 @@ Tensor InplaceBinaryOperation<binary_op_type>::invoke(
const Tensor &input_tensor_b_arg,
std::optional<unary::FusedActivations> activations,
std::optional<unary::UnaryWithParam> input_tensor_a_activation) {

return BinaryOperation<binary_op_type>::invoke(input_tensor_a_arg, input_tensor_b_arg, std::nullopt, std::nullopt, input_tensor_a_arg, activations, input_tensor_a_activation);
return BinaryOperation<binary_op_type>::invoke(
input_tensor_a_arg,
input_tensor_b_arg,
std::nullopt,
std::nullopt,
input_tensor_a_arg,
activations,
input_tensor_a_activation);
}

template <BinaryOpType binary_op_type>
Expand All @@ -371,7 +375,8 @@ Tensor InplaceBinaryOperation<binary_op_type>::invoke(
const float scalar,
std::optional<unary::FusedActivations> activations,
std::optional<unary::UnaryWithParam> input_tensor_a_activation) {
return BinaryOperation<binary_op_type>::invoke(input_tensor_a, scalar, std::nullopt, std::nullopt, input_tensor_a, activations, input_tensor_a_activation);
return BinaryOperation<binary_op_type>::invoke(
input_tensor_a, scalar, std::nullopt, std::nullopt, input_tensor_a, activations, input_tensor_a_activation);
}

template struct BinaryOperation<BinaryOpType::ADD>;
Expand Down Expand Up @@ -403,7 +408,6 @@ template struct InplaceRelationalBinary<BinaryOpType::LTE>;
template struct InplaceRelationalBinary<BinaryOpType::EQ>;
template struct InplaceRelationalBinary<BinaryOpType::NE>;


template struct InplaceLogicalBinary<BinaryOpType::LOGICAL_AND>;
template struct InplaceLogicalBinary<BinaryOpType::LOGICAL_OR>;

Expand Down
12 changes: 5 additions & 7 deletions ttnn/cpp/ttnn/operations/eltwise/binary/binary.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,22 +39,20 @@ struct BinaryOperation {
std::optional<unary::FusedActivations> activations = std::nullopt,
std::optional<unary::UnaryWithParam> input_tensor_a_activation = std::nullopt);

// TODO: this case should use BinaryWithScalarProgramConfig and there should be a custom kernel to run this
// Currently, this is exactly how tt::tt_metal::add_unary works
static Tensor invoke(
uint8_t queue_id,
const ttnn::Tensor &input_tensor_a,
const float scalar,
const std::optional<const DataType> &dtype = std::nullopt,
float scalar,
const std::optional<const DataType> &output_dtype = std::nullopt,
const std::optional<ttnn::MemoryConfig> &memory_config = std::nullopt,
const std::optional<Tensor> &optional_output_tensor = std::nullopt,
std::optional<unary::FusedActivations> activations = std::nullopt,
std::optional<unary::UnaryWithParam> input_tensor_a_activation = std::nullopt);

static Tensor invoke(
uint8_t queue_id,
const ttnn::Tensor &input_tensor_a,
const float scalar,
const std::optional<const DataType> &dtype = std::nullopt,
float scalar,
const std::optional<const DataType> &output_dtype = std::nullopt,
const std::optional<ttnn::MemoryConfig> &memory_config = std::nullopt,
const std::optional<Tensor> &optional_output_tensor = std::nullopt,
std::optional<unary::FusedActivations> activations = std::nullopt,
Expand Down
Loading

0 comments on commit 9b443e6

Please sign in to comment.