Skip to content

Commit

Permalink
tenstorrent#13644: Add support for tensor-scalar binary ops
Browse files Browse the repository at this point in the history
  • Loading branch information
yan-zaretskiy authored and Christopher Taylor committed Nov 9, 2024
1 parent 6530072 commit 9b443e6
Show file tree
Hide file tree
Showing 12 changed files with 379 additions and 238 deletions.
2 changes: 1 addition & 1 deletion .clang-tidy
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ Checks: >
performance-*,
modernize-*,
readability-*,
cppcoreguidelines-*
cppcoreguidelines-*,
-modernize-use-trailing-return-type
CheckOptions:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <stdint.h>
#include "dataflow_api.h"
#include "ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/dataflow/generate_bcast_scalar.hpp"



void kernel_main() {
    // Reader kernel for tensor-scalar binary ops: streams tiles of input 0
    // into CB 0 (skipped when the input is already sharded into L1) and
    // materializes the packed scalar operand as a broadcast tile in CB 1.
    //
    // Runtime args:
    //   0: src0_addr          — base address of input 0
    //   1: packed_scalar      — scalar operand packed into 32 bits
    //                           (format defined by generate_bcast_unary_scalar)
    //   2: num_tiles          — number of tiles this core reads
    //   3: HtWt               — tiles per batch (presumably Ht * Wt — confirm against host)
    //   4: base_start_id_HtWt — tile id at the start of the current batch
    //   5: curr_id_from_base  — offset of the next tile within the batch
    //   6: bcast_id           — reserved in the arg layout; not used by this kernel
    const auto src0_addr = get_arg_val<uint32_t>(0);
    const auto packed_scalar = get_arg_val<uint32_t>(1);
    const auto num_tiles = get_arg_val<uint32_t>(2);
    const auto HtWt = get_arg_val<uint32_t>(3);
    auto base_start_id_HtWt = get_arg_val<uint32_t>(4);
    auto curr_id_from_base = get_arg_val<uint32_t>(5);
    [[maybe_unused]] const auto bcast_id = get_arg_val<uint32_t>(6);

    constexpr uint32_t cb_id_in0 = 0;
    constexpr uint32_t cb_id_in1 = 1;
    constexpr uint32_t onetile = 1;

#ifndef IN0_SHARDED
    constexpr bool src0_is_dram = get_compile_time_arg_val(0) == 1;

    // Single-tile ublocks: address generator for reading input 0 tile by tile.
    const uint32_t in0_tile_bytes = get_tile_size(cb_id_in0);
    const DataFormat in0_data_format = get_dataformat(cb_id_in0);
    const InterleavedAddrGenFast<src0_is_dram> s0 = {
        .bank_base_address = src0_addr,
        .page_size = in0_tile_bytes,
        .data_format = in0_data_format};
#else
    // Sharded input already resides in the CB's L1 region; just publish
    // all tiles at once so the compute kernel can consume them.
    cb_reserve_back(cb_id_in0, num_tiles);
    cb_push_back(cb_id_in0, num_tiles);
#endif

    // Write the scalar tile into CB 1 once; the compute kernel broadcasts
    // it against every input tile.
    generate_bcast_unary_scalar(cb_id_in1, packed_scalar);

    for (uint32_t i = 0; i < num_tiles; i++) {
        const uint32_t curr_id = base_start_id_HtWt + curr_id_from_base;

#ifndef IN0_SHARDED
        cb_reserve_back(cb_id_in0, onetile);
        const uint32_t l1_write_addr_in0 = get_write_ptr(cb_id_in0);
        noc_async_read_tile(curr_id, s0, l1_write_addr_in0);
        // The tile must have fully landed in L1 before it is published.
        noc_async_read_barrier();
        cb_push_back(cb_id_in0, onetile);
#endif

        curr_id_from_base++;
        // Wrap to the start of the next HtWt-sized batch.
        if (curr_id_from_base == HtWt) {
            base_start_id_HtWt += HtWt;
            curr_id_from_base = 0;
        }
    }
}
98 changes: 51 additions & 47 deletions ttnn/cpp/ttnn/operations/eltwise/binary/binary.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,11 +98,8 @@ inline Tensor binary_impl(
return output_tensor;
}

template<BinaryOpType binary_op_type>
auto preprocess_inputs(
const Tensor& input_tensor_a_arg,
const Tensor& input_tensor_b_arg) {

template <BinaryOpType binary_op_type>
auto preprocess_inputs(const Tensor &input_tensor_a_arg, const Tensor &input_tensor_b_arg) {
Tensor input_tensor_a = input_tensor_a_arg;
Tensor input_tensor_b = input_tensor_b_arg;

Expand Down Expand Up @@ -149,8 +146,8 @@ Tensor BinaryOperation<binary_op_type>::invoke(
std::optional<Tensor> optional_output_tensor,
std::optional<unary::FusedActivations> activations,
std::optional<unary::UnaryWithParam> input_tensor_a_activation) {

auto [input_tensor_a, input_tensor_b] = detail::preprocess_inputs<binary_op_type>(input_tensor_a_arg, input_tensor_b_arg);
auto [input_tensor_a, input_tensor_b] =
detail::preprocess_inputs<binary_op_type>(input_tensor_a_arg, input_tensor_b_arg);

return ttnn::prim::binary(
queue_id,
Expand Down Expand Up @@ -184,53 +181,44 @@ Tensor BinaryOperation<binary_op_type>::invoke(
input_tensor_a_activation);
}

// TODO: this case should use BinaryWithScalarProgramConfig and there should be a custom kernel to run this
// Currently, this is exactly how tt::tt_metal::add_unary works
template <BinaryOpType binary_op_type>
Tensor BinaryOperation<binary_op_type>::invoke(
uint8_t queue_id,
const ttnn::Tensor &input_tensor_a,
const float scalar,
const std::optional<const DataType> &dtype,
float scalar,
const std::optional<const DataType> &output_dtype,
const std::optional<ttnn::MemoryConfig> &memory_config,
const std::optional<Tensor> &optional_output_tensor,
std::optional<unary::FusedActivations> activations,
std::optional<unary::UnaryWithParam> input_tensor_a_activation) {
return BinaryOperation::invoke(
DefaultQueueId,
return ttnn::prim::binary(
queue_id,
input_tensor_a,
scalar,
dtype,
binary_op_type,
output_dtype,
memory_config,
optional_output_tensor,
activations,
input_tensor_a_activation);
}

// TODO: this case should use BinaryWithScalarProgramConfig and there should be a custom kernel to run this
// Currently, this is exactly how tt::tt_metal::add_unary works
template <BinaryOpType binary_op_type>
Tensor BinaryOperation<binary_op_type>::invoke(
uint8_t queue_id,
const ttnn::Tensor &input_tensor_a,
const float scalar,
const std::optional<const DataType> &dtype,
float scalar,
const std::optional<const DataType> &output_dtype,
const std::optional<ttnn::MemoryConfig> &memory_config,
const std::optional<Tensor> &optional_output_tensor,
std::optional<unary::FusedActivations> activations,
std::optional<unary::UnaryWithParam> input_tensor_a_activation) {
using namespace tt::constants;
// Cast Float Scalar to a device tensor
auto host_buffer = owned_buffer::create<::bfloat16>(static_cast<std::size_t>(TILE_HEIGHT * TILE_WIDTH));
host_buffer[0] = scalar;
Tensor scalar_tensor_host = Tensor(
OwnedStorage{host_buffer},
ttnn::Shape(std::array<std::uint32_t, 2>{1, 1}, std::array<std::uint32_t, 2>{TILE_HEIGHT, TILE_WIDTH}),
DataType::BFLOAT16,
Layout::TILE);
Tensor scalar_tensor_device = scalar_tensor_host.to(input_tensor_a.device());
// TODO(arakhmati): #7637 pass in memory_config instead of operation::DEFAULT_OUTPUT_MEMORY_CONFIG
return BinaryOperation::invoke(
DefaultQueueId,
input_tensor_a,
scalar_tensor_device,
dtype,
scalar,
output_dtype,
memory_config,
optional_output_tensor,
activations,
Expand All @@ -253,7 +241,8 @@ Tensor RelationalBinary<binary_op_type>::invoke(
"If both output dtype and output tensor provided dtype should match");
}

auto [input_tensor_a, input_tensor_b] = detail::preprocess_inputs<binary_op_type>(input_tensor_a_arg, input_tensor_b_arg);
auto [input_tensor_a, input_tensor_b] =
detail::preprocess_inputs<binary_op_type>(input_tensor_a_arg, input_tensor_b_arg);

auto output_memory_config = memory_config.value_or(input_tensor_a.memory_config());
DataType dtype = output_dtype.value_or(input_tensor_a.get_dtype());
Expand Down Expand Up @@ -334,25 +323,34 @@ Tensor RelationalBinary<binary_op_type>::invoke(

template <BinaryOpType binary_op_type>
Tensor InplaceRelationalBinary<binary_op_type>::invoke(
const Tensor &input_tensor_a_arg,
const Tensor &input_tensor_b_arg) {

return RelationalBinary<binary_op_type>::invoke(input_tensor_a_arg, input_tensor_b_arg, std::nullopt, std::nullopt, input_tensor_a_arg, std::nullopt, std::nullopt);
const Tensor &input_tensor_a_arg, const Tensor &input_tensor_b_arg) {
return RelationalBinary<binary_op_type>::invoke(
input_tensor_a_arg,
input_tensor_b_arg,
std::nullopt,
std::nullopt,
input_tensor_a_arg,
std::nullopt,
std::nullopt);
}

template <BinaryOpType binary_op_type>
Tensor InplaceRelationalBinary<binary_op_type>::invoke(
const ttnn::Tensor &input_tensor_a,
const float scalar) {
return RelationalBinary<binary_op_type>::invoke(input_tensor_a, scalar, std::nullopt, std::nullopt, input_tensor_a, std::nullopt, std::nullopt);
Tensor InplaceRelationalBinary<binary_op_type>::invoke(const ttnn::Tensor &input_tensor_a, const float scalar) {
return RelationalBinary<binary_op_type>::invoke(
input_tensor_a, scalar, std::nullopt, std::nullopt, input_tensor_a, std::nullopt, std::nullopt);
}

template <BinaryOpType binary_op_type>
Tensor InplaceLogicalBinary<binary_op_type>::invoke(
const Tensor &input_tensor_a_arg,
const Tensor &input_tensor_b_arg) {

return BinaryOperation<binary_op_type>::invoke(input_tensor_a_arg, input_tensor_b_arg, std::nullopt, std::nullopt, input_tensor_a_arg, std::nullopt, std::nullopt);
const Tensor &input_tensor_a_arg, const Tensor &input_tensor_b_arg) {
return BinaryOperation<binary_op_type>::invoke(
input_tensor_a_arg,
input_tensor_b_arg,
std::nullopt,
std::nullopt,
input_tensor_a_arg,
std::nullopt,
std::nullopt);
}

template <BinaryOpType binary_op_type>
Expand All @@ -361,8 +359,14 @@ Tensor InplaceBinaryOperation<binary_op_type>::invoke(
const Tensor &input_tensor_b_arg,
std::optional<unary::FusedActivations> activations,
std::optional<unary::UnaryWithParam> input_tensor_a_activation) {

return BinaryOperation<binary_op_type>::invoke(input_tensor_a_arg, input_tensor_b_arg, std::nullopt, std::nullopt, input_tensor_a_arg, activations, input_tensor_a_activation);
return BinaryOperation<binary_op_type>::invoke(
input_tensor_a_arg,
input_tensor_b_arg,
std::nullopt,
std::nullopt,
input_tensor_a_arg,
activations,
input_tensor_a_activation);
}

template <BinaryOpType binary_op_type>
Expand All @@ -371,7 +375,8 @@ Tensor InplaceBinaryOperation<binary_op_type>::invoke(
const float scalar,
std::optional<unary::FusedActivations> activations,
std::optional<unary::UnaryWithParam> input_tensor_a_activation) {
return BinaryOperation<binary_op_type>::invoke(input_tensor_a, scalar, std::nullopt, std::nullopt, input_tensor_a, activations, input_tensor_a_activation);
return BinaryOperation<binary_op_type>::invoke(
input_tensor_a, scalar, std::nullopt, std::nullopt, input_tensor_a, activations, input_tensor_a_activation);
}

template struct BinaryOperation<BinaryOpType::ADD>;
Expand Down Expand Up @@ -403,7 +408,6 @@ template struct InplaceRelationalBinary<BinaryOpType::LTE>;
template struct InplaceRelationalBinary<BinaryOpType::EQ>;
template struct InplaceRelationalBinary<BinaryOpType::NE>;


template struct InplaceLogicalBinary<BinaryOpType::LOGICAL_AND>;
template struct InplaceLogicalBinary<BinaryOpType::LOGICAL_OR>;

Expand Down
12 changes: 5 additions & 7 deletions ttnn/cpp/ttnn/operations/eltwise/binary/binary.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,22 +39,20 @@ struct BinaryOperation {
std::optional<unary::FusedActivations> activations = std::nullopt,
std::optional<unary::UnaryWithParam> input_tensor_a_activation = std::nullopt);

// TODO: this case should use BinaryWithScalarProgramConfig and there should be a custom kernel to run this
// Currently, this is exactly how tt::tt_metal::add_unary works
static Tensor invoke(
uint8_t queue_id,
const ttnn::Tensor &input_tensor_a,
const float scalar,
const std::optional<const DataType> &dtype = std::nullopt,
float scalar,
const std::optional<const DataType> &output_dtype = std::nullopt,
const std::optional<ttnn::MemoryConfig> &memory_config = std::nullopt,
const std::optional<Tensor> &optional_output_tensor = std::nullopt,
std::optional<unary::FusedActivations> activations = std::nullopt,
std::optional<unary::UnaryWithParam> input_tensor_a_activation = std::nullopt);

static Tensor invoke(
uint8_t queue_id,
const ttnn::Tensor &input_tensor_a,
const float scalar,
const std::optional<const DataType> &dtype = std::nullopt,
float scalar,
const std::optional<const DataType> &output_dtype = std::nullopt,
const std::optional<ttnn::MemoryConfig> &memory_config = std::nullopt,
const std::optional<Tensor> &optional_output_tensor = std::nullopt,
std::optional<unary::FusedActivations> activations = std::nullopt,
Expand Down
Loading

0 comments on commit 9b443e6

Please sign in to comment.