diff --git a/include/ttmlir/Dialect/TTNN/Analysis/L1ChainConfig.h b/include/ttmlir/Dialect/TTNN/Analysis/L1ChainConfig.h index 3c57ca66b..b8aee2e4e 100644 --- a/include/ttmlir/Dialect/TTNN/Analysis/L1ChainConfig.h +++ b/include/ttmlir/Dialect/TTNN/Analysis/L1ChainConfig.h @@ -58,7 +58,7 @@ class L1ChainConfig { std::unordered_set &memReconfigEdges); bool isEmpty() { return opL1MemSpecs.empty(); } - void addOpL1MemSpec(OpL1MemSpec &&spec) { + void addOpL1MemSpec(OpL1MemSpec spec) { assert(state == L1ChainState::InBuild); l1ChainedOps.insert(spec.op); opL1MemSpecs.push_back(std::move(spec)); diff --git a/include/ttmlir/Dialect/TTNN/Analysis/L1InterleavedPolicy.h b/include/ttmlir/Dialect/TTNN/Analysis/L1InterleavedPolicy.h index f453e9a1d..2392cd7c9 100644 --- a/include/ttmlir/Dialect/TTNN/Analysis/L1InterleavedPolicy.h +++ b/include/ttmlir/Dialect/TTNN/Analysis/L1InterleavedPolicy.h @@ -8,10 +8,43 @@ #include "mlir/Dialect/Func/IR/FuncOps.h" #include "ttmlir/Dialect/TTNN/Analysis/L1ChainConfig.h" #include "ttmlir/Dialect/TTNN/Analysis/MemoryLayoutAnalysisPolicy.h" +#include "ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.h" namespace mlir::tt::ttnn { class L1InterleavedPolicy : public MemoryLayoutAnalysisPolicy { +public: + struct OpMemSpec { + TTNNLayoutAttr layout; + // Minimum L1 memory usage required for scheduling the op + // given the layouts of all the ops that are already scheduled. + // + uint64_t requiredL1Usage; + }; + + // This struct is holding information about the greedily chosen + // configuration of the @baseOp: 1) layouts and 2) precedence. + // + // The @layouts represents the mapping between the op and its chosen + // layout. All the ops that are included in the @layouts map must be + // either @baseOp or its operand with legal L1 Interleaved output layout + // at the moment of analyzing the @baseOp. + // + // The @precedence represents the order of the op's operands in which they + // should be scheduled. 
Only op's operands that are included in the @layouts + // map are included in the @precedence. + // + struct OpConfig { + Operation *baseOp; + llvm::DenseMap layouts; + llvm::SmallVector precedence; + }; + + struct L1Usage { + size_t outputL1Usage; + size_t requiredL1Usage; + }; + public: L1InterleavedPolicy( Operation *rootOp, std::vector &l1ChainConfigs, @@ -22,7 +55,71 @@ class L1InterleavedPolicy : public MemoryLayoutAnalysisPolicy { : MemoryLayoutAnalysisPolicy(rootOp, l1ChainConfigs, legalLayouts, schedule, usableL1CacheSize) {} + /** + * Retrieve the greedy OpConfig for the given base operation + * and its opsL1Usage map. + * + * @param baseOp The base operation for which the greedy configuration is + * being determined. + * @param opsL1Usage A map between the operation and its output L1 usage. All + * operations included in the opsL1Usage map must be either the baseOp or its + * operand with a legal L1 Interleaved output layout at the time of analyzing + * the baseOp. + * @return The greedy OpConfig for the baseOp. + */ + OpConfig getGreedyConfig(Operation *baseOp, + llvm::DenseMap &opsL1Usage); + void run() final; + +private: + // Check if the op is analyzable. Op is analyzable if it has at least one + // legal layout. + bool isAnalyzable(Operation *op); + + // Fetch op's DRAM layout from legalLayouts. + bool hasDRAMBufferType(Operation *op); + TTNNLayoutAttr getDRAMLayout(Operation *op); + + // Fetch op's L1 Interleaved layout from legalLayouts. + bool hasL1BufferType(Operation *op); + TTNNLayoutAttr getL1InterleavedLayout(Operation *op); + + size_t getAvailableL1CacheSize() const { + // Figure out this const based on exec data, but will be replaced + // with API. + // + constexpr float tensorL1UsageCap = 0.75; + return tensorL1UsageCap * usableL1CacheSize; + } + + // Precedence schedule map for each operation. It contains the order + // in which operands need to be executed for each op. 
+ llvm::DenseMap> precedenceMap; + + llvm::DenseSet visitedOps; + void buildSchedule(mlir::Operation *op, func::FuncOp &func) { + + // Schedule all the precedents of the current operation + // + visitedOps.insert(op); + for (Operation *precedent : precedenceMap[op]) { + if (!visitedOps.count(precedent)) { + buildSchedule(precedent, func); + } + } + + (*schedule)[func].push_back(op); + } + + void constructSchedule(func::FuncOp &func) { + func->walk([&](Operation *op) { + if (op->hasTrait()) { + Operation *outputOp = op->getOperand(0).getDefiningOp(); + buildSchedule(outputOp, func); + } + }); + } }; } // namespace mlir::tt::ttnn diff --git a/include/ttmlir/Dialect/TTNN/Analysis/MemoryLayoutAnalysis.h b/include/ttmlir/Dialect/TTNN/Analysis/MemoryLayoutAnalysis.h index e8b603815..bc6284c3a 100644 --- a/include/ttmlir/Dialect/TTNN/Analysis/MemoryLayoutAnalysis.h +++ b/include/ttmlir/Dialect/TTNN/Analysis/MemoryLayoutAnalysis.h @@ -6,10 +6,10 @@ #define TTMLIR_DIALECT_TTNN_ANALYSIS_MEMORYLAYOUTANALYSIS_H #include "mlir/Dialect/Func/IR/FuncOps.h" -#include "ttmlir/Dialect/TT/Utils/MemoryLayoutAnalysisParams.h" #include "ttmlir/Dialect/TTNN/Analysis/Edge.h" #include "ttmlir/Dialect/TTNN/Analysis/L1ChainConfig.h" #include "ttmlir/Dialect/TTNN/Analysis/TTNNAnalysis.h" +#include "ttmlir/Dialect/TTNN/Utils/MemoryLayoutAnalysisParams.h" namespace mlir::tt::ttnn { diff --git a/include/ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.td b/include/ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.td index e45fba003..7d5b10abb 100644 --- a/include/ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.td +++ b/include/ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.td @@ -148,6 +148,9 @@ def TTNN_TTNNLayoutAttr: TTNN_Attr<"TTNNLayout", "ttnn_layout"> { bool hasShardedTensorMemoryLayout() const; bool hasShardedL1TensorMemoryLayout() const; bool hasInterleavedL1TensorMemoryLayout() const; + bool hasInterleavedDRAMTensorMemoryLayout() const; + bool hasL1BufferType() const; + bool hasDRAMBufferType() const; bool isTiled() const; Layout 
getLayout() const; Type getElementType() const; diff --git a/include/ttmlir/Dialect/TTNN/Pipelines/TTNNPipelines.h b/include/ttmlir/Dialect/TTNN/Pipelines/TTNNPipelines.h index 636d5f623..58206039b 100644 --- a/include/ttmlir/Dialect/TTNN/Pipelines/TTNNPipelines.h +++ b/include/ttmlir/Dialect/TTNN/Pipelines/TTNNPipelines.h @@ -5,9 +5,8 @@ #ifndef TTMLIR_DIALECT_TTNN_PIPELINES_TTNNPIPELINES_H #define TTMLIR_DIALECT_TTNN_PIPELINES_TTNNPIPELINES_H -#include "ttmlir/Dialect/TT/Utils/MemoryLayoutAnalysisParams.h" +#include "ttmlir/Dialect/TTNN/Utils/MemoryLayoutAnalysisParams.h" #include "ttmlir/Dialect/TTNN/Utils/PassOverrides.h" -#include "ttmlir/Dialect/TTNN/Utils/Utils.h" #include "mlir/Pass/PassOptions.h" diff --git a/include/ttmlir/Dialect/TT/Utils/MemoryLayoutAnalysisParams.h b/include/ttmlir/Dialect/TTNN/Utils/MemoryLayoutAnalysisParams.h similarity index 88% rename from include/ttmlir/Dialect/TT/Utils/MemoryLayoutAnalysisParams.h rename to include/ttmlir/Dialect/TTNN/Utils/MemoryLayoutAnalysisParams.h index 4a44e883d..5275e2340 100644 --- a/include/ttmlir/Dialect/TT/Utils/MemoryLayoutAnalysisParams.h +++ b/include/ttmlir/Dialect/TTNN/Utils/MemoryLayoutAnalysisParams.h @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 -#ifndef TTMLIR_DIALECT_TT_UTILS_MEMORYLAYOUTANALYSISPARAMS_H -#define TTMLIR_DIALECT_TT_UTILS_MEMORYLAYOUTANALYSISPARAMS_H +#ifndef TTMLIR_DIALECT_TTNN_UTILS_MEMORYLAYOUTANALYSISPARAMS_H +#define TTMLIR_DIALECT_TTNN_UTILS_MEMORYLAYOUTANALYSISPARAMS_H #include #include @@ -49,4 +49,4 @@ struct MemoryLayoutAnalysisPolicyTypeParser } // namespace mlir::tt -#endif // TTMLIR_DIALECT_TT_UTILS_MEMORYLAYOUTANALYSISPARAMS_H +#endif // TTMLIR_DIALECT_TTNN_UTILS_MEMORYLAYOUTANALYSISPARAMS_H diff --git a/include/ttmlir/Dialect/TTNN/Utils/OptimizerOverrides.h b/include/ttmlir/Dialect/TTNN/Utils/OptimizerOverrides.h index c474106e3..eccc62f26 100644 --- a/include/ttmlir/Dialect/TTNN/Utils/OptimizerOverrides.h +++ 
b/include/ttmlir/Dialect/TTNN/Utils/OptimizerOverrides.h @@ -5,8 +5,8 @@ #ifndef TTMLIR_DIALECT_TTNN_UTILS_OPTIMIZEROVERRIDES_H #define TTMLIR_DIALECT_TTNN_UTILS_OPTIMIZEROVERRIDES_H -#include "ttmlir/Dialect/TT/Utils/MemoryLayoutAnalysisParams.h" #include "ttmlir/Dialect/TTNN/Pipelines/TTNNPipelines.h" +#include "ttmlir/Dialect/TTNN/Utils/MemoryLayoutAnalysisParams.h" #include "ttmlir/Dialect/TTNN/Utils/PassOverrides.h" namespace mlir::tt::ttnn { diff --git a/include/ttmlir/Scheduler/Scheduler.h b/include/ttmlir/Scheduler/Scheduler.h index 817271fdc..5d4116331 100644 --- a/include/ttmlir/Scheduler/Scheduler.h +++ b/include/ttmlir/Scheduler/Scheduler.h @@ -23,6 +23,10 @@ class Scheduler { // Method to get the next set of schedulable operations llvm::SmallVector getScheduleableOps(); + // Method to check if an operation is either a TTIR op or a + // TTNN scheduleable op. + bool isTTShedulableOp(mlir::Operation *op); + // Method to check if an operation can be scheduled bool canSchedule(mlir::Operation *op); diff --git a/lib/Dialect/TTNN/Analysis/CMakeLists.txt b/lib/Dialect/TTNN/Analysis/CMakeLists.txt index 996064d79..640702f71 100644 --- a/lib/Dialect/TTNN/Analysis/CMakeLists.txt +++ b/lib/Dialect/TTNN/Analysis/CMakeLists.txt @@ -15,6 +15,6 @@ add_mlir_dialect_library(MLIRTTNNAnalysis MLIRTTNNPassesIncGen MLIRTTOpsIncGen - LINK_LIBS + LINK_LIBS PUBLIC MLIRScheduler ) diff --git a/lib/Dialect/TTNN/Analysis/L1InterleavedPolicy.cpp b/lib/Dialect/TTNN/Analysis/L1InterleavedPolicy.cpp index c0b3ff102..23c1b306a 100644 --- a/lib/Dialect/TTNN/Analysis/L1InterleavedPolicy.cpp +++ b/lib/Dialect/TTNN/Analysis/L1InterleavedPolicy.cpp @@ -3,19 +3,23 @@ // SPDX-License-Identifier: Apache-2.0 #include "ttmlir/Dialect/TTNN/Analysis/L1InterleavedPolicy.h" -#include "ttmlir/Dialect/TT/IR/TTOpsTypes.h" -#include "ttmlir/Dialect/TTNN/IR/TTNNOps.h" +#include "ttmlir/Dialect/TTNN/Analysis/L1ChainConfig.h" #include "ttmlir/Scheduler/Scheduler.h" namespace mlir::tt::ttnn { -uint64_t 
getOpOutputLayoutUsage( - Operation *op, - llvm::DenseMap> &legalLayouts, - DeviceAttr &deviceAttr) { - TTNNLayoutAttr opLayout = legalLayouts.lookup(op).front(); - assert(opLayout.hasInterleavedL1TensorMemoryLayout()); +uint64_t getOpOutputL1Usage(Operation *op, TTNNLayoutAttr opLayout, + DeviceAttr &deviceAttr) { + // In case the opLayout is not in L1 memory space, L1 memory usage is 0. + // + if (opLayout.hasDRAMBufferType()) { + return 0; + } + // L1 memory usage of the ops without output tensors cannot be calculated. + // So far, this is only false for ttnn.get_device op. + // + assert(mlir::isa(op->getResult(0).getType())); llvm::ArrayRef opOutputTensorShape = mlir::cast(op->getResult(0).getType()).getShape(); @@ -24,132 +28,327 @@ uint64_t getOpOutputLayoutUsage( return opL1OutputUsage; } -void L1InterleavedPolicy::run() { - rootOp->walk([&](func::FuncOp func) { - DeviceAttr deviceAttr = getCurrentScopeDevice(func); - mlir::tt::scheduler::Scheduler scheduler(&func); - llvm::SmallVector scheduleableOps; - llvm::DenseMap selectedOpLayout; - Operation *currentOp = nullptr; +L1InterleavedPolicy::OpConfig L1InterleavedPolicy::getGreedyConfig( + Operation *baseOp, llvm::DenseMap &opsL1Usage) { + uint64_t numOfOps, bitIndex, currentMask; + uint64_t currentL1Usage, optimalL1Usage; + llvm::DenseMap optimalLayouts; + llvm::SmallVector optimalPrecedence; + + constexpr uint64_t maxNumOfOps = sizeof(numOfOps) * 8; + numOfOps = opsL1Usage.size(); + assert(numOfOps < maxNumOfOps); // Strict: the 64-bit shift below is undefined for a count of 64. + + optimalL1Usage = 0; + for (currentMask = 0; currentMask < (uint64_t{1} << numOfOps); currentMask++) { + std::bitset bitset(currentMask); + llvm::DenseMap currentLayouts; + llvm::SmallVector currentPrecedence, optimalL1Precedence, + L1Precedence; - // TODO(fbajraktari): Add algorithm description. Currently, the algorithm - // is the same as for DFSharding policy, but works only for L1 interleaved. + // Calculate the L1 usage of the current configuration. 
// - l1ChainConfigs->push_back(L1ChainConfig()); - while (scheduler.hasUnscheduledOps()) { - scheduleableOps = scheduler.getScheduleableOps(); + currentL1Usage = 0; + bitIndex = 0; + for (const auto &[op, l1Usage] : opsL1Usage) { + if (bitset[bitIndex]) { + // In case we have an operand with L1 interleaved layout, we need to + // figure out its schedule among the other operands with L1 interleaved + // layout. Therefore, we insert all of them into the L1Precedence where + // calculate the optimal L1Precedence and then concatenate it with the + // currentPrecedence. + // + currentL1Usage += l1Usage.outputL1Usage; + currentLayouts[op] = getL1InterleavedLayout(op); + + // Skip the baseOp. + // + if (baseOp != op) { + L1Precedence.push_back(op); + } + } else { + // It is optimal to first schedule all ops with DRAM output layout. + // Therefore, we can directly insert them into the + // currentOptimalPrecedence. + // + currentLayouts[op] = getDRAMLayout(op); - // Before starting a l1 chain, schedule layout/memory management ops - // first until they are exhausted from schedulable ops. + // Skip the baseOp. + // + if (baseOp != op) { + currentPrecedence.push_back(op); + } + } + bitIndex += 1; + } + + // Calculate the optimal L1Precedence. + // + bool isMaskLegal = false; + uint64_t minRequiredL1Usage = getAvailableL1CacheSize(); + + std::sort(L1Precedence.begin(), L1Precedence.end()); + do { + // Check if the current order of L1Precedence is legal. 
// - if (l1ChainConfigs->back().isEmpty()) { - for (auto *op : scheduleableOps) { - if (isa(op)) { - currentOp = op; - break; - } + bool isLegal = true; + uint64_t intermediateL1Usage = 0; + uint64_t intermediateRequiredL1Usage = 0; + for (Operation *op : L1Precedence) { + if (intermediateL1Usage + opsL1Usage[op].requiredL1Usage > + getAvailableL1CacheSize()) { + isLegal = false; + break; } + + intermediateRequiredL1Usage = + std::max(intermediateRequiredL1Usage, + intermediateL1Usage + opsL1Usage[op].requiredL1Usage); + intermediateL1Usage += opsL1Usage[op].outputL1Usage; } - if (currentOp == nullptr) { - currentOp = scheduleableOps[0]; + // Pick optimal L1Precedence among all legal L1Precedence. + // The one that requires the least amount of L1 cache overall is + // considered optimal. + // + if (isLegal && intermediateRequiredL1Usage < minRequiredL1Usage) { + isMaskLegal = true; + minRequiredL1Usage = intermediateRequiredL1Usage; + optimalL1Precedence = L1Precedence; } + } while (std::next_permutation(L1Precedence.begin(), L1Precedence.end())); + + if (isMaskLegal && optimalL1Usage < currentL1Usage && + currentL1Usage <= getAvailableL1CacheSize()) { - // Schedule currentOp. + // Append the legal L1Precedence to the currentPrecedence and therefore + // create a complete precedence for the baseOp and currentMask. // - scheduler.scheduleOp(currentOp); + currentPrecedence.insert(currentPrecedence.end(), + optimalL1Precedence.begin(), + optimalL1Precedence.end()); - // Skip starting sharding chain if currentOp is a memory management op. + // Update the optimal configuration. // - if (l1ChainConfigs->back().isEmpty() && isa(currentOp)) { - currentOp = nullptr; - continue; - } + optimalL1Usage = currentL1Usage; + optimalLayouts = std::move(currentLayouts); + optimalPrecedence = std::move(currentPrecedence); + } + } - if (scheduler.hasUnscheduledOps()) { - scheduleableOps = scheduler.getScheduleableOps(); + // Create the optimal config. 
+ // + OpConfig optimalConfig; + optimalConfig.baseOp = baseOp; + optimalConfig.layouts = std::move(optimalLayouts); + optimalConfig.precedence = std::move(optimalPrecedence); - // Check if currentOp has a valid successor. + return optimalConfig; +} + +void L1InterleavedPolicy::run() { + for (Operation &funcOp : rootOp->getRegion(0).getOps()) { + func::FuncOp func = dyn_cast(funcOp); + DeviceAttr deviceAttr = getCurrentScopeDevice(func); + + // Start the policy. + // + llvm::DenseMap OpMemSpecMap; + mlir::tt::scheduler::Scheduler scheduler(&func); + llvm::SmallVector scheduleableOps; + + while (scheduler.hasUnscheduledOps()) { + scheduleableOps = scheduler.getScheduleableOps(); + + for (Operation *op : scheduleableOps) { + // Schedule the op. // - Operation *nextOp = nullptr; - for (auto *op : scheduleableOps) { - for (auto operand : op->getOperands()) { - if (operand.getDefiningOp() == currentOp) { - nextOp = op; - break; - } + scheduler.scheduleOp(op); + + // Find optimal configuration for the op. + // + llvm::DenseMap opsL1Usage; + llvm::SmallVector opsPrecedence; + + // Generate optimal configuration for the current op based on the + // outputs of its operands and its legal output layouts. + // + if (isAnalyzable(op)) { + + // Create the OpMemSpec. + // + OpMemSpec OpMemSpec; + assert(hasDRAMBufferType(op)); + OpMemSpec.layout = getDRAMLayout(op); + OpMemSpec.requiredL1Usage = 0; + OpMemSpecMap[op] = OpMemSpec; + + if (op->hasOneUse() && hasL1BufferType(op)) { + L1Usage l1Usage; + l1Usage.outputL1Usage = + getOpOutputL1Usage(op, getL1InterleavedLayout(op), deviceAttr); + l1Usage.requiredL1Usage = 0; + opsL1Usage[op] = l1Usage; } } - if (nextOp) { + for (auto operand : op->getOperands()) { + // Skip block arguments (%arg0, %arg1, ...) + // + if (::llvm::isa(operand)) { + continue; + } - // V1: Check that currentOp is not fork/join op. + Operation *operandOp = operand.getDefiningOp(); + + // Skip non-analyzable operands. 
// - bool validForL1Interleaved = - currentOp->hasOneUse() && - legalLayouts.lookup(currentOp).size() > 0 && - legalLayouts.lookup(nextOp).size() > 0; - - if (validForL1Interleaved) { - // Figure out this const based on exec data, but will be replaced - // with API. + if (isAnalyzable(operandOp)) { + TTNNLayoutAttr operandOpLayout = OpMemSpecMap[operandOp].layout; + + // Take into consideration only the operands with L1 interleaved + // memory space. // - constexpr float tensorL1UsageCap = 0.8; - uint64_t currentOpL1OutputUsage = - getOpOutputLayoutUsage(currentOp, legalLayouts, deviceAttr); - uint64_t nextOpL1OutputUsage = - getOpOutputLayoutUsage(nextOp, legalLayouts, deviceAttr); - bool l1UsageValid = (currentOpL1OutputUsage + nextOpL1OutputUsage) < - tensorL1UsageCap * usableL1CacheSize; - - if (l1UsageValid) { - selectedOpLayout[currentOp] = - legalLayouts.lookup(currentOp).front(); - - // Add currentOp to l1 chain config. - // - OpL1MemSpec shardSpec; - shardSpec.op = currentOp; - - // Hardcoded tensor split factor for now, until pipeline OP - // support is added. - // - shardSpec.tensorSplitFactor = 1; - l1ChainConfigs->back().addOpL1MemSpec(std::move(shardSpec)); - - // Update currentOp pointer. - // - currentOp = nextOp; - continue; + if (operandOpLayout.hasInterleavedL1TensorMemoryLayout()) { + L1Usage l1Usage; + l1Usage.outputL1Usage = + getOpOutputL1Usage(operandOp, operandOpLayout, deviceAttr); + l1Usage.requiredL1Usage = OpMemSpecMap[operandOp].requiredL1Usage; + opsL1Usage[operandOp] = l1Usage; + } + // In case the operand has DRAM layout, we can insert it into the + // precedence directly. If the op is analyzable, it means that it + // is definitely schedulable. + // + else { + opsPrecedence.push_back(operandOp); + } + } + // In case the operand is not analyzable, i.e. there are no legal + // layouts for this operand, we can insert it into the precedence + // directly if it is schedulable since it does not use DRAM nor L1 + // memory. 
+ // + else { + if (scheduler.isTTShedulableOp(operandOp)) { + opsPrecedence.push_back(operandOp); } } } - currentOp = nullptr; - if (!l1ChainConfigs->back().isEmpty()) { - l1ChainConfigs->back().build(); - l1ChainConfigs->push_back(L1ChainConfig()); + // Greedily find the optimal configuration. + // + OpConfig optimalConfig = getGreedyConfig(op, opsL1Usage); + for (const auto &[op, layout] : optimalConfig.layouts) { + OpMemSpecMap[op].layout = layout; + } + + // Override op's precedence. + // + opsPrecedence.insert(opsPrecedence.end(), + optimalConfig.precedence.begin(), + optimalConfig.precedence.end()); + precedenceMap[op] = std::move(opsPrecedence); + + // Update op's requiredL1Usage if the op is analyzable. + // + if (isAnalyzable(op)) { + uint64_t intermediateRequiredL1Usage = 0; + uint64_t intermediateL1Usage = 0; + for (auto operand : op->getOperands()) { + // Skip block arguments (%arg0, %arg1, ...) + // + if (::llvm::isa(operand)) { + continue; + } + + Operation *operandOp = operand.getDefiningOp(); + + // Skip non-analyzable operands. + // + if (isAnalyzable(operandOp)) { + intermediateRequiredL1Usage = + std::max(intermediateRequiredL1Usage, + intermediateL1Usage + + OpMemSpecMap[operandOp].requiredL1Usage); + intermediateL1Usage += getOpOutputL1Usage( + operandOp, OpMemSpecMap[operandOp].layout, deviceAttr); + } + } + OpMemSpecMap[op].requiredL1Usage = std::max( + intermediateRequiredL1Usage, + intermediateL1Usage + + getOpOutputL1Usage(op, OpMemSpecMap[op].layout, deviceAttr)); } } } - if (l1ChainConfigs->back().isEmpty()) { - l1ChainConfigs->pop_back(); - } + // Construct the schedule. + // + constructSchedule(func); - // Schedule + // Build, Resolve and Complete the L1 chain. + // This implementation is only here until we are able to merge + // L1ChainConfigs. + // TODO(fbajraktari): Fix this hack. 
// - (*schedule)[func] = scheduler.getSchedule(); + l1ChainConfigs->push_back(L1ChainConfig()); + llvm::DenseMap selectedOpLayout; + for (auto &OpMemSpec : OpMemSpecMap) { + OpL1MemSpec opL1MemSpec; + opL1MemSpec.op = OpMemSpec.first; + opL1MemSpec.tensorSplitFactor = 1; + selectedOpLayout[OpMemSpec.first] = OpMemSpec.second.layout; + l1ChainConfigs->back().addOpL1MemSpec(opL1MemSpec); + } + l1ChainConfigs->back().build(); + l1ChainConfigs->back().resolve(); + std::unordered_set memReconfigEdges; + l1ChainConfigs->back().complete(selectedOpLayout, memReconfigEdges); + } +} - // Resolve l1 chain configs. +bool L1InterleavedPolicy::isAnalyzable(Operation *op) { + // Skip operations that are not analyzed by the LegalGridAnalysis. + // + if (legalLayouts.count(op) > 0) { + // Skip operations that are filtered out by the MemoryLayoutAnalysis. // - for (auto &l1ChainConfig : *l1ChainConfigs) { - l1ChainConfig.resolve(); + return legalLayouts[op].size() > 0; + } + return false; +} - std::unordered_set memReconfigEdges; - l1ChainConfig.complete(selectedOpLayout, memReconfigEdges); - } - }); +bool L1InterleavedPolicy::hasDRAMBufferType(Operation *op) { + return std::find_if(legalLayouts[op].begin(), legalLayouts[op].end(), + [](TTNNLayoutAttr layout) { + return layout.hasDRAMBufferType(); + }) != legalLayouts[op].end(); +} + +TTNNLayoutAttr L1InterleavedPolicy::getDRAMLayout(Operation *op) { + assert(hasDRAMBufferType(op)); + auto dramLayoutIter = std::find_if( + legalLayouts[op].begin(), legalLayouts[op].end(), + [](TTNNLayoutAttr layout) { return layout.hasDRAMBufferType(); }); + return *dramLayoutIter; +} + +bool L1InterleavedPolicy::hasL1BufferType(Operation *op) { + return std::find_if(legalLayouts[op].begin(), legalLayouts[op].end(), + [](TTNNLayoutAttr layout) { + return layout.hasInterleavedL1TensorMemoryLayout(); + }) != legalLayouts[op].end(); +} + +TTNNLayoutAttr L1InterleavedPolicy::getL1InterleavedLayout(Operation *op) { + assert(hasL1BufferType(op)); + auto 
l1InterleaveLayoutIter = + std::find_if(legalLayouts[op].begin(), legalLayouts[op].end(), + [](TTNNLayoutAttr layout) { + return layout.hasInterleavedL1TensorMemoryLayout(); + }); + return *l1InterleaveLayoutIter; } } // namespace mlir::tt::ttnn diff --git a/lib/Dialect/TTNN/Analysis/LegalGridAnalysis.cpp b/lib/Dialect/TTNN/Analysis/LegalGridAnalysis.cpp index b01f4cf38..9bbbccf5e 100644 --- a/lib/Dialect/TTNN/Analysis/LegalGridAnalysis.cpp +++ b/lib/Dialect/TTNN/Analysis/LegalGridAnalysis.cpp @@ -115,6 +115,14 @@ void LegalGridAnalysis::analysisImplementation() { return; } + if (!isa(op->getResult(0).getType())) { + return; + } + + if (llvm::isa(op)) { + return; + } + // Get output tensor type. RankedTensorType tensorType = mlir::cast(op->getResult(0).getType()); diff --git a/lib/Dialect/TTNN/Analysis/MemoryLayoutAnalysis.cpp b/lib/Dialect/TTNN/Analysis/MemoryLayoutAnalysis.cpp index a89c5842b..f3db4ed7b 100644 --- a/lib/Dialect/TTNN/Analysis/MemoryLayoutAnalysis.cpp +++ b/lib/Dialect/TTNN/Analysis/MemoryLayoutAnalysis.cpp @@ -5,6 +5,7 @@ #include "ttmlir/Dialect/TTNN/Analysis/MemoryLayoutAnalysis.h" #include "ttmlir/Dialect/TTNN/Analysis/DFShardingPolicy.h" #include "ttmlir/Dialect/TTNN/Analysis/L1InterleavedPolicy.h" +#include "ttmlir/Dialect/TTNN/IR/TTNNOpsAttrs.h" namespace mlir::tt::ttnn { @@ -35,14 +36,15 @@ filterShardedOnly(const llvm::DenseMap> } llvm::DenseMap> -filterL1InterleavedOnly( +filterDRAMAndL1Interleaved( const llvm::DenseMap> &legalLayouts) { llvm::DenseMap> l1InterleavedLayouts; for (const auto &opLayouts : legalLayouts) { std::vector opL1InterleavedLayouts; for (const auto &layout : opLayouts.second) { - if (layout.hasInterleavedL1TensorMemoryLayout()) { + if (layout.hasDRAMBufferType() || + layout.hasInterleavedL1TensorMemoryLayout()) { opL1InterleavedLayouts.push_back(layout); } } @@ -68,7 +70,8 @@ void MemoryLayoutAnalysis::analysisImplementation() { } case MemoryLayoutAnalysisPolicyType::L1Interleaved: { L1InterleavedPolicy 
l1InterleavedPolicy( - op, l1ChainConfigs, filterL1InterleavedOnly(analysisInput.legalLayouts), + op, l1ChainConfigs, + filterDRAMAndL1Interleaved(analysisInput.legalLayouts), analysisResult.schedule, analysisInput.usableL1CacheSize); l1InterleavedPolicy.run(); break; diff --git a/lib/Dialect/TTNN/IR/TTNNOpsAttrs.cpp b/lib/Dialect/TTNN/IR/TTNNOpsAttrs.cpp index 8aaae1261..10b54f418 100644 --- a/lib/Dialect/TTNN/IR/TTNNOpsAttrs.cpp +++ b/lib/Dialect/TTNN/IR/TTNNOpsAttrs.cpp @@ -24,6 +24,11 @@ inline bool isDeviceBufferType(BufferType bufferType) { return bufferType == BufferType::DRAM || bufferType == BufferType::L1; } +// Check if tensor is in DRAM memory +inline bool isDRAMBufferType(BufferType bufferType) { + return bufferType == BufferType::DRAM; +} + // Check if tensor is in L1 memory inline bool isL1BufferType(BufferType bufferType) { return bufferType == BufferType::L1; @@ -39,6 +44,16 @@ Layout TTNNLayoutAttr::getLayout() const { return isTiled() ? Layout::Tile : Layout::RowMajor; } +// Check if the tensor memory buffer type is L1 +bool TTNNLayoutAttr::hasL1BufferType() const { + return isL1BufferType(getBufferType()); +} + +// Check if the tensor memory buffer type is DRAM +bool TTNNLayoutAttr::hasDRAMBufferType() const { + return isDRAMBufferType(getBufferType()); +} + // Check if the tensor memory layout is sharded bool TTNNLayoutAttr::hasShardedTensorMemoryLayout() const { return (getMemLayout() == TensorMemoryLayout::HeightSharded || @@ -48,7 +63,7 @@ bool TTNNLayoutAttr::hasShardedTensorMemoryLayout() const { // Check if the tensor memory layout is sharded in L1 memory bool TTNNLayoutAttr::hasShardedL1TensorMemoryLayout() const { - return isL1BufferType(getBufferType()) && + return hasL1BufferType() && (getMemLayout() == TensorMemoryLayout::HeightSharded || getMemLayout() == TensorMemoryLayout::WidthSharded || getMemLayout() == TensorMemoryLayout::BlockSharded); @@ -56,7 +71,13 @@ bool TTNNLayoutAttr::hasShardedL1TensorMemoryLayout() const { // Check 
if the tensor memory layout is interleaved and in L1 memory bool TTNNLayoutAttr::hasInterleavedL1TensorMemoryLayout() const { - return isL1BufferType(getBufferType()) && + return hasL1BufferType() && + (getMemLayout() == TensorMemoryLayout::Interleaved); +} + +// Check if the tensor memory layout is interleaved and in DRAM memory +bool TTNNLayoutAttr::hasInterleavedDRAMTensorMemoryLayout() const { + return hasDRAMBufferType() && (getMemLayout() == TensorMemoryLayout::Interleaved); } diff --git a/lib/Dialect/TTNN/Transforms/Optimizer.cpp b/lib/Dialect/TTNN/Transforms/Optimizer.cpp index e5d2f86d8..783f3ea07 100644 --- a/lib/Dialect/TTNN/Transforms/Optimizer.cpp +++ b/lib/Dialect/TTNN/Transforms/Optimizer.cpp @@ -170,6 +170,10 @@ class TTNNOptimizer : public impl::TTNNOptimizerBase { return; } + if (llvm::isa(op)) { + return; + } + RankedTensorType tensorType = mlir::cast(op->getResult(0).getType()); LegalGridAnalysis legalGridAnalysis = diff --git a/lib/Scheduler/Scheduler.cpp b/lib/Scheduler/Scheduler.cpp index 25923fffd..52066c5e8 100644 --- a/lib/Scheduler/Scheduler.cpp +++ b/lib/Scheduler/Scheduler.cpp @@ -12,7 +12,8 @@ namespace mlir::tt::scheduler { -bool isTTNNOp(mlir::Operation *op) { +// TTNN op is scheduleable if it is not an EmptyOp and has at least one result. 
+bool isTTNNScheduleableOp(mlir::Operation *op) { return isa(op->getDialect()) && op->getNumResults() > 0 && !llvm::isa(op); } @@ -21,8 +22,8 @@ bool isTTIROp(mlir::Operation *op) { return isa(op->getDialect()); } -bool isTTShedulableOp(mlir::Operation *op) { - return isTTNNOp(op) || isTTIROp(op); +bool Scheduler::isTTShedulableOp(mlir::Operation *op) { + return isTTNNScheduleableOp(op) || isTTIROp(op); } // Init the dependencies map of all ops which are TTIR ops diff --git a/test/ttmlir/Silicon/TTNN/optimizer/all_l1_interleaved_policy.mlir b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/all_l1_interleaved_policy.mlir similarity index 79% rename from test/ttmlir/Silicon/TTNN/optimizer/all_l1_interleaved_policy.mlir rename to test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/all_l1_interleaved_policy.mlir index 6fa884d79..11eb41da1 100644 --- a/test/ttmlir/Silicon/TTNN/optimizer/all_l1_interleaved_policy.mlir +++ b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/all_l1_interleaved_policy.mlir @@ -1,30 +1,27 @@ -// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path% enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s > %t.mlir -// RUN: FileCheck %s --input-file=%t.mlir -// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s | FileCheck %s #any_device = #tt.operand_constraint module attributes {} { func.func @forward(%arg0: tensor<64x128xbf16>, %arg1: tensor<128x96xbf16>, %arg2: tensor<64x96xbf16>, %arg3: tensor<96x32xbf16>, %arg4: tensor<64x32xbf16>) -> tensor<64x32xbf16> { // CHECK: #[[L1_:.*]] = #ttnn.buffer_type - // CHECK: #[[LAYOUT_6:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<{{.*}}, #l1_>, interleaved> // CHECK: #[[LAYOUT_7:.*]] = #ttnn.ttnn_layout<(d0, d1) -> 
(d0, d1), <{{.*}}>, memref<{{.*}}, #l1_>, interleaved> - // CHECK: #[[LAYOUT_8:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<{{.*}}, #dram>, interleaved> + // CHECK: #[[LAYOUT_10:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<{{.*}}, #l1_>, interleaved> %0 = tensor.empty() : tensor<64x96xbf16> - // CHECK: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<64x96xbf16, #[[LAYOUT_6]]> + // CHECK: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<64x96xbf16, #[[LAYOUT_7]]> %1 = "ttir.matmul"(%arg0, %arg1, %0) <{operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x128xbf16>, tensor<128x96xbf16>, tensor<64x96xbf16>) -> tensor<64x96xbf16> %2 = tensor.empty() : tensor<64x96xbf16> - // CHECK: %{{.*}} = "ttnn.add"{{.*}} -> tensor<64x96xbf16, #[[LAYOUT_6]]> + // CHECK: %{{.*}} = "ttnn.add"{{.*}} -> tensor<64x96xbf16, #[[LAYOUT_7]]> %3 = "ttir.add"(%1, %arg2, %2) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x96xbf16>, tensor<64x96xbf16>, tensor<64x96xbf16>) -> tensor<64x96xbf16> %4 = tensor.empty() : tensor<64x96xbf16> - // CHECK: %{{.*}} = "ttnn.relu"{{.*}} -> tensor<64x96xbf16, #[[LAYOUT_6]]> + // CHECK: %{{.*}} = "ttnn.relu"{{.*}} -> tensor<64x96xbf16, #[[LAYOUT_7]]> %5 = "ttir.relu"(%3, %4) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x96xbf16>, tensor<64x96xbf16>) -> tensor<64x96xbf16> %6 = tensor.empty() : tensor<64x32xbf16> - // CHECK: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<64x32xbf16, #[[LAYOUT_7]]> + // CHECK: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<64x32xbf16, #[[LAYOUT_10]]> %7 = "ttir.matmul"(%5, %arg3, %6) <{operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x96xbf16>, tensor<96x32xbf16>, tensor<64x32xbf16>) -> tensor<64x32xbf16> %8 = tensor.empty() : tensor<64x32xbf16> - // CHECK: %{{.*}} = "ttnn.add"{{.*}} -> tensor<64x32xbf16, #[[LAYOUT_7]]> + // CHECK: %{{.*}} = "ttnn.add"{{.*}} -> 
tensor<64x32xbf16, #[[LAYOUT_10]]> %9 = "ttir.add"(%7, %arg4, %8) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x32xbf16>, tensor<64x32xbf16>, tensor<64x32xbf16>) -> tensor<64x32xbf16> %10 = tensor.empty() : tensor<64x32xbf16> - // CHECK: %{{.*}} = "ttnn.relu"{{.*}} -> tensor<64x32xbf16, #[[LAYOUT_8]]> + // CHECK: %{{.*}} = "ttnn.relu"{{.*}} -> tensor<64x32xbf16, #[[LAYOUT_10]]> %11 = "ttir.relu"(%9, %10) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x32xbf16>, tensor<64x32xbf16>) -> tensor<64x32xbf16> return %11 : tensor<64x32xbf16> } diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/fork_join.mlir b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/fork_join.mlir new file mode 100644 index 000000000..fef8cdd48 --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/fork_join.mlir @@ -0,0 +1,45 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s | FileCheck %s +// +// A +// | +// B +// / \ +// C D +// | | +// | E +// \ / +// F +// | +// G +// +// This tests two things: +// 1. Output of op B (fork op) should be in DRAM. +// 2. Even though both precedence [C, E] and [E, C] for op F are legal, +// the optimizer should choose the one with lower requiredL1Usage. In +// this case, [E, C] should be chosen. 
+// +#any_device = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<64x64xbf16>, %arg1: tensor<64x32xbf16>) -> tensor<64x32xbf16> { + // CHECK: #[[L1_:.*]] = #ttnn.buffer_type + // CHECK: #[[LAYOUT_3:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <8x8>, memref<8x8xbf16, #dram>, interleaved> + // CHECK: #[[LAYOUT_5:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <8x8>, memref<8x4xbf16, #l1_>, interleaved> + // CHECK: #[[LAYOUT_6:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <8x8>, memref<8x8xbf16, #l1_>, interleaved> + %0 = tensor.empty() : tensor<64x64xbf16> + // CHECK: %{{.*}} = "ttnn.relu"{{.*}} -> tensor<64x64xbf16, #[[LAYOUT_3]]> + %1 = "ttir.relu"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x64xbf16>, tensor<64x64xbf16>) -> tensor<64x64xbf16> + %2 = tensor.empty() : tensor<64x64xbf16> + %3 = "ttir.relu"(%1, %2) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x64xbf16>, tensor<64x64xbf16>) -> tensor<64x64xbf16> + %4 = tensor.empty() : tensor<64x32xbf16> + %5 = "ttir.matmul"(%1, %arg1, %4) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x64xbf16>, tensor<64x32xbf16>, tensor<64x32xbf16>) -> tensor<64x32xbf16> + %6 = tensor.empty() : tensor<64x32xbf16> + %7 = "ttir.relu"(%5, %6) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x32xbf16>, tensor<64x32xbf16>) -> tensor<64x32xbf16> + %8 = tensor.empty() : tensor<64x32xbf16> + // CHECK: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<64x32xbf16, #[[LAYOUT_5]]> + // CHECK: %{{.*}} = "ttnn.relu"{{.*}} -> tensor<64x32xbf16, #[[LAYOUT_5]]> + // CHECK: %{{.*}} = "ttnn.relu"{{.*}} -> tensor<64x64xbf16, #[[LAYOUT_6]]> + // CHECK: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<64x32xbf16, #[[LAYOUT_5]]> + %9 = "ttir.matmul"(%3, %7, %8) <{operandSegmentSizes = array, operand_constraints = [#any_device, 
#any_device, #any_device]}> : (tensor<64x64xbf16>, tensor<64x32xbf16>, tensor<64x32xbf16>) -> tensor<64x32xbf16> + return %9 : tensor<64x32xbf16> + } +} diff --git a/test/ttmlir/Silicon/TTNN/optimizer/mnist_l1_interleaved.mlir b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/mnist_l1_interleaved.mlir similarity index 88% rename from test/ttmlir/Silicon/TTNN/optimizer/mnist_l1_interleaved.mlir rename to test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/mnist_l1_interleaved.mlir index ef6fae268..93a19ad6e 100644 --- a/test/ttmlir/Silicon/TTNN/optimizer/mnist_l1_interleaved.mlir +++ b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/mnist_l1_interleaved.mlir @@ -1,13 +1,11 @@ -// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path% enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s > %t.mlir -// RUN: FileCheck %s --input-file=%t.mlir -// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s | FileCheck %s #any_device = #tt.operand_constraint #loc = loc("MNISTLinear":4294967295:0) module @"tt-forge-graph" attributes {} { func.func @main(%arg0: tensor<1x784xf32> loc("MNISTLinear":4294967295:0), %arg1: tensor<1x10xf32> loc("MNISTLinear":4294967295:0), %arg2: tensor<256x10xf32> loc("MNISTLinear":4294967295:0), %arg3: tensor<1x256xf32> loc("MNISTLinear":4294967295:0), %arg4: tensor<784x256xf32> loc("MNISTLinear":4294967295:0)) -> tensor<1x10xf32> { + // CHECK: #[[L1_:.*]] = #ttnn.buffer_type // CHECK: #[[LAYOUT_6:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<{{.*}}, #l1_>, interleaved> // CHECK: #[[LAYOUT_7:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<{{.*}}, #l1_>, interleaved> - // CHECK: #[[LAYOUT_8:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, 
memref<{{.*}}, #dram>, interleaved> %0 = tensor.empty() : tensor<1x256xf32> loc(#loc8) // CHECK: %[[C:.*]] = "ttnn.matmul"[[C:.*]] -> tensor<1x256xf32, #[[LAYOUT_6]]> %1 = "ttir.matmul"(%arg0, %arg4, %0) <{operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<1x784xf32>, tensor<784x256xf32>, tensor<1x256xf32>) -> tensor<1x256xf32> loc(#loc8) @@ -24,7 +22,7 @@ module @"tt-forge-graph" attributes {} { // CHECK: %[[C:.*]] = "ttnn.add"[[C:.*]] -> tensor<1x10xf32, #[[LAYOUT_7]]> %9 = "ttir.add"(%7, %arg1, %8) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32> loc(#loc12) %10 = tensor.empty() : tensor<1x10xf32> loc(#loc13) - // CHECK: %{{.*}} = "ttnn.softmax"{{.*}} -> tensor<1x10xf32, #[[LAYOUT_8]]> + // CHECK: %{{.*}} = "ttnn.softmax"{{.*}} -> tensor<1x10xf32, #[[LAYOUT_7]]> %11 = "ttir.softmax"(%9, %10) <{dimension = 1 : si32, operand_constraints = [#any_device, #any_device]}> : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32> loc(#loc13) return %11 : tensor<1x10xf32> loc(#loc7) } loc(#loc) diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_ABC_l1_None.mlir b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_ABC_l1_None.mlir new file mode 100644 index 000000000..acbb8d674 --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_ABC_l1_None.mlir @@ -0,0 +1,28 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s | FileCheck %s +// +// A B +// \ / +// C +// | +// D +// +// (A > L1) AND (B > L1) AND (C > L1) +// => +// DRAM: ABC; L1: None +// +#any_device = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<8192x8192xbf16>, %arg1: tensor<8192x8192xbf16>, %arg2: 
tensor<8192x8192xbf16>, %arg3: tensor<8192x8192xbf16>) -> tensor<8192x8192xbf16> { + // CHECK-DAG: #[[LAYOUT_2:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<1024x1024xbf16, #dram>, interleaved> + %0 = tensor.empty() : tensor<8192x8192xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<8192x8192xbf16, #[[LAYOUT_2]]> + %1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<8192x8192xbf16>, tensor<8192x8192xbf16>, tensor<8192x8192xbf16>) -> tensor<8192x8192xbf16> + %2 = tensor.empty() : tensor<8192x8192xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<8192x8192xbf16, #[[LAYOUT_2]]> + %3 = "ttir.add"(%arg2, %arg3, %2) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<8192x8192xbf16>, tensor<8192x8192xbf16>, tensor<8192x8192xbf16>) -> tensor<8192x8192xbf16> + %4 = tensor.empty() : tensor<8192x8192xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<8192x8192xbf16, #[[LAYOUT_2]]> + %5 = "ttir.matmul"(%1, %3, %4) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<8192x8192xbf16>, tensor<8192x8192xbf16>, tensor<8192x8192xbf16>) -> tensor<8192x8192xbf16> + return %5 : tensor<8192x8192xbf16> + } +} diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_AB_l1_C.mlir b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_AB_l1_C.mlir new file mode 100644 index 000000000..49aebb6a4 --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_AB_l1_C.mlir @@ -0,0 +1,31 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s | FileCheck %s +// +// A B +// \ / +// C +// | +// D +// +// (A + C > L1) AND (B + C > L1) AND (A + B > L1) AND (A < 
C) AND (B < C) AND (C <= L1) +// => +// DRAM: AB; L1: C +// +#any_device = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<5120x4096xbf16>, %arg1: tensor<5120x4096xbf16>, %arg2: tensor<4096x5120xbf16>, %arg3: tensor<4096x5120xbf16>) -> tensor<5120x5120xbf16> { + // CHECK: #[[L1_:.*]] = #ttnn.buffer_type + // CHECK-DAG: #[[LAYOUT_4:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<512x640xbf16, #dram>, interleaved> + // CHECK-DAG: #[[LAYOUT_6:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<640x512xbf16, #dram>, interleaved> + // CHECK-DAG: #[[LAYOUT_7:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<640x640xbf16, #l1_>, interleaved> + %0 = tensor.empty() : tensor<5120x4096xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<5120x4096xbf16, #[[LAYOUT_6]]> + %1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<5120x4096xbf16>, tensor<5120x4096xbf16>, tensor<5120x4096xbf16>) -> tensor<5120x4096xbf16> + %2 = tensor.empty() : tensor<4096x5120xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<4096x5120xbf16, #[[LAYOUT_4]]> + %3 = "ttir.add"(%arg2, %arg3, %2) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<4096x5120xbf16>, tensor<4096x5120xbf16>, tensor<4096x5120xbf16>) -> tensor<4096x5120xbf16> + %4 = tensor.empty() : tensor<5120x5120xbf16> + // CHECK: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<5120x5120xbf16, #[[LAYOUT_7]]> + %5 = "ttir.matmul"(%1, %3, %4) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<5120x4096xbf16>, tensor<4096x5120xbf16>, tensor<5120x5120xbf16>) -> tensor<5120x5120xbf16> + return %5 : tensor<5120x5120xbf16> + } +} diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_AC_l1_B.mlir 
b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_AC_l1_B.mlir new file mode 100644 index 000000000..7f41675cd --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_AC_l1_B.mlir @@ -0,0 +1,30 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s | FileCheck %s +// +// A B +// \ / +// C +// | +// D +// +// (A + C > L1) AND (B + C > L1) AND (A + B > L1) AND (A < B) AND (C < B) AND (B <= L1) +// => +// DRAM: AC; L1: B +// +#any_device = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<4096x5120xbf16>, %arg1: tensor<4096x5120xbf16>, %arg2: tensor<5120x5120xbf16>, %arg3: tensor<5120x5120xbf16>) -> tensor<4096x5120xbf16> { + // CHECK: #[[L1_:.*]] = #ttnn.buffer_type + // CHECK-DAG: #[[LAYOUT_3:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<512x640xbf16, #dram>, interleaved> + // CHECK-DAG: #[[LAYOUT_5:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<640x640xbf16, #l1_>, interleaved> + %0 = tensor.empty() : tensor<4096x5120xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<4096x5120xbf16, #[[LAYOUT_3]]> + %1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<4096x5120xbf16>, tensor<4096x5120xbf16>, tensor<4096x5120xbf16>) -> tensor<4096x5120xbf16> + %2 = tensor.empty() : tensor<5120x5120xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<5120x5120xbf16, #[[LAYOUT_5]]> + %3 = "ttir.add"(%arg2, %arg3, %2) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<5120x5120xbf16>, tensor<5120x5120xbf16>, tensor<5120x5120xbf16>) -> tensor<5120x5120xbf16> + %4 = tensor.empty() : tensor<4096x5120xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<4096x5120xbf16, 
#[[LAYOUT_3]]> + %5 = "ttir.matmul"(%1, %3, %4) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<4096x5120xbf16>, tensor<5120x5120xbf16>, tensor<4096x5120xbf16>) -> tensor<4096x5120xbf16> + return %5 : tensor<4096x5120xbf16> + } +} diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_A_l1_BC.mlir b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_A_l1_BC.mlir new file mode 100644 index 000000000..7d4c923b4 --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_A_l1_BC.mlir @@ -0,0 +1,30 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s | FileCheck %s +// +// A B +// \ / +// C +// | +// D +// +// (A + B + C > L1) AND (A + C < B + C) AND (A + B < B + C) AND (B + C <= L1) +// => +// DRAM: A; L1: BC +// +#any_device = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<2048x2048xbf16>, %arg1: tensor<2048x2048xbf16>, %arg2: tensor<2048x8192xbf16>, %arg3: tensor<2048x8192xbf16>) -> tensor<2048x8192xbf16> { + // CHECK: #[[L1_:.*]] = #ttnn.buffer_type + // CHECK-DAG: #[[LAYOUT_3:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<256x256xbf16, #dram>, interleaved> + // CHECK-DAG: #[[LAYOUT_5:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<256x1024xbf16, #l1_>, interleaved> + %0 = tensor.empty() : tensor<2048x2048xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<2048x2048xbf16, #[[LAYOUT_3]]> + %1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<2048x2048xbf16>, tensor<2048x2048xbf16>, tensor<2048x2048xbf16>) -> tensor<2048x2048xbf16> + %2 = tensor.empty() : tensor<2048x8192xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> 
tensor<2048x8192xbf16, #[[LAYOUT_5]]> + %3 = "ttir.add"(%arg2, %arg3, %2) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<2048x8192xbf16>, tensor<2048x8192xbf16>, tensor<2048x8192xbf16>) -> tensor<2048x8192xbf16> + %4 = tensor.empty() : tensor<2048x8192xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<2048x8192xbf16, #[[LAYOUT_5]]> + %5 = "ttir.matmul"(%1, %3, %4) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<2048x2048xbf16>, tensor<2048x8192xbf16>, tensor<2048x8192xbf16>) -> tensor<2048x8192xbf16> + return %5 : tensor<2048x8192xbf16> + } +} diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_BC_l1_A.mlir b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_BC_l1_A.mlir new file mode 100644 index 000000000..c915fadd1 --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_BC_l1_A.mlir @@ -0,0 +1,30 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s | FileCheck %s +// +// A B +// \ / +// C +// | +// D +// +// (A + C > L1) AND (B + C > L1) AND (A + B > L1) AND (B < A) AND (C < A) AND (A <= L1) +// => +// DRAM: BC; L1: A +// +#any_device = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<5120x5120xbf16>, %arg1: tensor<5120x5120xbf16>, %arg2: tensor<5120x4096xbf16>, %arg3: tensor<5120x4096xbf16>) -> tensor<5120x4096xbf16> { + // CHECK: #[[L1_:.*]] = #ttnn.buffer_type + // CHECK-DAG: #[[LAYOUT_3:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<640x512xbf16, #dram>, interleaved> + // CHECK-DAG: #[[LAYOUT_5:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<640x640xbf16, #l1_>, interleaved> + %0 = tensor.empty() : tensor<5120x5120xbf16> + // CHECK-DAG: %{{.*}} = 
"ttnn.add"{{.*}} -> tensor<5120x5120xbf16, #[[LAYOUT_5]]> + %1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<5120x5120xbf16>, tensor<5120x5120xbf16>, tensor<5120x5120xbf16>) -> tensor<5120x5120xbf16> + %2 = tensor.empty() : tensor<5120x4096xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<5120x4096xbf16, #[[LAYOUT_3]]> + %3 = "ttir.add"(%arg2, %arg3, %2) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<5120x4096xbf16>, tensor<5120x4096xbf16>, tensor<5120x4096xbf16>) -> tensor<5120x4096xbf16> + %4 = tensor.empty() : tensor<5120x4096xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<5120x4096xbf16, #[[LAYOUT_3]]> + %5 = "ttir.matmul"(%1, %3, %4) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<5120x5120xbf16>, tensor<5120x4096xbf16>, tensor<5120x4096xbf16>) -> tensor<5120x4096xbf16> + return %5 : tensor<5120x4096xbf16> + } +} diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_B_l1_AC.mlir b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_B_l1_AC.mlir new file mode 100644 index 000000000..3d2538e24 --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_B_l1_AC.mlir @@ -0,0 +1,30 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s | FileCheck %s +// +// A B +// \ / +// C +// | +// D +// +// (A + B + C > L1) AND (B + C < A + C) AND (A + B < A + C) AND (A + C <= L1) +// => +// DRAM: B; L1: AC +// +#any_device = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<8192x2048xbf16>, %arg1: tensor<8192x2048xbf16>, %arg2: tensor<2048x2048xbf16>, %arg3: tensor<2048x2048xbf16>) -> tensor<8192x2048xbf16> { 
+ // CHECK: #[[L1_:.*]] = #ttnn.buffer_type + // CHECK-DAG: #[[LAYOUT_3:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<256x256xbf16, #dram>, interleaved> + // CHECK-DAG: #[[LAYOUT_5:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<1024x256xbf16, #l1_>, interleaved> + %0 = tensor.empty() : tensor<8192x2048xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<8192x2048xbf16, #[[LAYOUT_5]]> + %1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<8192x2048xbf16>, tensor<8192x2048xbf16>, tensor<8192x2048xbf16>) -> tensor<8192x2048xbf16> + %2 = tensor.empty() : tensor<2048x2048xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<2048x2048xbf16, #[[LAYOUT_3]]> + %3 = "ttir.add"(%arg2, %arg3, %2) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<2048x2048xbf16>, tensor<2048x2048xbf16>, tensor<2048x2048xbf16>) -> tensor<2048x2048xbf16> + %4 = tensor.empty() : tensor<8192x2048xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<8192x2048xbf16, #[[LAYOUT_5]]> + %5 = "ttir.matmul"(%1, %3, %4) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<8192x2048xbf16>, tensor<2048x2048xbf16>, tensor<8192x2048xbf16>) -> tensor<8192x2048xbf16> + return %5 : tensor<8192x2048xbf16> + } +} diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_C_l1_AB.mlir b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_C_l1_AB.mlir new file mode 100644 index 000000000..320f00ce3 --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_C_l1_AB.mlir @@ -0,0 +1,31 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s | FileCheck %s +// +// A B +// 
\ / +// C +// | +// D +// +// (A + B + C > L1) AND (A + C < A + B) AND (B + C < A + B) AND (A + B <= L1) +// => +// DRAM: C; L1: AB +// +#any_device = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<2048x8192xbf16>, %arg1: tensor<2048x8192xbf16>, %arg2: tensor<8192x2048xbf16>, %arg3: tensor<8192x2048xbf16>) -> tensor<2048x2048xbf16> { + // CHECK: #[[L1_:.*]] = #ttnn.buffer_type + // CHECK-DAG: #[[LAYOUT_4:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<256x1024xbf16, #l1_>, interleaved> + // CHECK-DAG: #[[LAYOUT_6:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<1024x256xbf16, #l1_>, interleaved> + // CHECK-DAG: #[[LAYOUT_7:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<256x256xbf16, #dram>, interleaved> + %0 = tensor.empty() : tensor<2048x8192xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<2048x8192xbf16, #[[LAYOUT_4]]> + %1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<2048x8192xbf16>, tensor<2048x8192xbf16>, tensor<2048x8192xbf16>) -> tensor<2048x8192xbf16> + %2 = tensor.empty() : tensor<8192x2048xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<8192x2048xbf16, #[[LAYOUT_6]]> + %3 = "ttir.add"(%arg2, %arg3, %2) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<8192x2048xbf16>, tensor<8192x2048xbf16>, tensor<8192x2048xbf16>) -> tensor<8192x2048xbf16> + %4 = tensor.empty() : tensor<2048x2048xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<2048x2048xbf16, #[[LAYOUT_7]]> + %5 = "ttir.matmul"(%1, %3, %4) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<2048x8192xbf16>, tensor<8192x2048xbf16>, tensor<2048x2048xbf16>) -> tensor<2048x2048xbf16> + return %5 : tensor<2048x2048xbf16> + } +} diff --git 
a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_None_l1_ABC.mlir b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_None_l1_ABC.mlir new file mode 100644 index 000000000..a21a11f87 --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/simple_join_tests/dram_None_l1_ABC.mlir @@ -0,0 +1,29 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s | FileCheck %s +// +// A B +// \ / +// C +// | +// D +// +// (A + B + C <= L1) +// => +// DRAM: None; L1: ABC +// +#any_device = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<32x32xbf16>, %arg1: tensor<32x32xbf16>, %arg2: tensor<32x32xbf16>, %arg3: tensor<32x32xbf16>) -> tensor<32x32xbf16> { + // CHECK: #[[L1_:.*]] = #ttnn.buffer_type + // CHECK-DAG: #[[LAYOUT_2:.*]] = #ttnn.ttnn_layout<(d0, d1) -> (d0, d1), <{{.*}}>, memref<4x4xbf16, #l1_>, interleaved> + %0 = tensor.empty() : tensor<32x32xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<32x32xbf16, #[[LAYOUT_2]]> + %1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<32x32xbf16>, tensor<32x32xbf16>, tensor<32x32xbf16>) -> tensor<32x32xbf16> + %2 = tensor.empty() : tensor<32x32xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<32x32xbf16, #[[LAYOUT_2]]> + %3 = "ttir.add"(%arg2, %arg3, %2) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<32x32xbf16>, tensor<32x32xbf16>, tensor<32x32xbf16>) -> tensor<32x32xbf16> + %4 = tensor.empty() : tensor<32x32xbf16> + // CHECK-DAG: %{{.*}} = "ttnn.add"{{.*}} -> tensor<32x32xbf16, #[[LAYOUT_2]]> + %5 = "ttir.add"(%1, %3, %4) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<32x32xbf16>, 
tensor<32x32xbf16>, tensor<32x32xbf16>) -> tensor<32x32xbf16> + return %5 : tensor<32x32xbf16> + } +} diff --git a/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/single_op.mlir b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/single_op.mlir new file mode 100644 index 000000000..482079993 --- /dev/null +++ b/test/ttmlir/Dialect/TTNN/optimizer/l1_interleaved_policy/single_op.mlir @@ -0,0 +1,10 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s | FileCheck %s +// UNSUPPORTED: true +#any_device_tile = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<5120x5120xbf16>) -> tensor<5120x5120xbf16> { + %0 = tensor.empty() : tensor<5120x5120xbf16> + %1 = "ttir.relu"(%arg0, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device_tile, #any_device_tile]}> : (tensor<5120x5120xbf16>, tensor<5120x5120xbf16>) -> tensor<5120x5120xbf16> + return %1 : tensor<5120x5120xbf16> + } +} diff --git a/test/ttmlir/Silicon/TTNN/optimizer/large_tensors.mlir b/test/ttmlir/Silicon/TTNN/optimizer/large_tensors.mlir deleted file mode 100644 index fb71dae8d..000000000 --- a/test/ttmlir/Silicon/TTNN/optimizer/large_tensors.mlir +++ /dev/null @@ -1,19 +0,0 @@ -// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path% enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s > %t.mlir -// RUN: FileCheck %s --input-file=%t.mlir -// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn -#any_device = #tt.operand_constraint -module attributes {} { - func.func @forward(%arg0: tensor<8192x8192xbf16>, %arg1: tensor<8192x8192xbf16>, %arg2: tensor<8192x8192xbf16>) -> tensor<8192x8192xbf16> { - // CHECK: #[[LAYOUT_2:ttnn_layout2]] = #ttnn.ttnn_layout<{{.*}}, memref<{{.*}}, #dram>, {{.*}}> - %0 = tensor.empty() : tensor<8192x8192xbf16> - // CHECK: %{{.*}} 
= "ttnn.add"{{.*}} -> tensor<8192x8192xbf16, #[[LAYOUT_2]]> - %1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<8192x8192xbf16>, tensor<8192x8192xbf16>, tensor<8192x8192xbf16>) -> tensor<8192x8192xbf16> - %2 = tensor.empty() : tensor<8192x8192xbf16> - // CHECK: %{{.*}} = "ttnn.add"{{.*}} -> tensor<8192x8192xbf16, #[[LAYOUT_2]]> - %3 = "ttir.add"(%1, %arg2, %2) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<8192x8192xbf16>, tensor<8192x8192xbf16>, tensor<8192x8192xbf16>) -> tensor<8192x8192xbf16> - %4 = tensor.empty() : tensor<8192x8192xbf16> - // CHECK: %{{.*}} = "ttnn.relu"{{.*}} -> tensor<8192x8192xbf16, #[[LAYOUT_2]]> - %7 = "ttir.relu"(%3, %4) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<8192x8192xbf16>, tensor<8192x8192xbf16>) -> tensor<8192x8192xbf16> - return %7 : tensor<8192x8192xbf16> - } -} diff --git a/test/unittests/Optimizer/CMakeLists.txt b/test/unittests/Optimizer/CMakeLists.txt index 4e6ee799a..b05c8ae29 100644 --- a/test/unittests/Optimizer/CMakeLists.txt +++ b/test/unittests/Optimizer/CMakeLists.txt @@ -1,11 +1,13 @@ add_mlir_unittest(OptimizerTests TestShardSolver.cpp TestOptimizerOverrides.cpp + TestL1InterleavedPolicy.cpp ) target_link_libraries(OptimizerTests PRIVATE MLIR MLIRTTDialect + MLIRTTNNAnalysis MLIRTTNNPipelines ) diff --git a/test/unittests/Optimizer/TestL1InterleavedPolicy.cpp b/test/unittests/Optimizer/TestL1InterleavedPolicy.cpp new file mode 100644 index 000000000..7d02cef56 --- /dev/null +++ b/test/unittests/Optimizer/TestL1InterleavedPolicy.cpp @@ -0,0 +1,193 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "mlir/IR/Value.h" +#include "mlir/IR/ValueRange.h" +#include "llvm/ADT/SmallVector.h" + +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include 
"mlir/IR/Builders.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/MLIRContext.h" + +#include "ttmlir/Dialect/TTNN/IR/TTNN.h" +#include "ttmlir/Dialect/TTNN/IR/TTNNOps.h" + +#include "ttmlir/Dialect/TTNN/Analysis/L1InterleavedPolicy.h" + +using namespace mlir::tt::ttnn; + +constexpr int TensorDimX = 128; +constexpr int TensorDimY = 128; + +class L1InterleavedPolicyBase : public ::testing::Test { +public: + mlir::MLIRContext context; + mlir::OwningOpRef module; + mlir::OpBuilder builder = mlir::OpBuilder(&context); + mlir::func::FuncOp func; + mlir::tt::DeviceAttr deviceAttr; + + using OpMemSpec = L1InterleavedPolicy::OpMemSpec; + using OpConfig = L1InterleavedPolicy::OpConfig; + using L1Usage = L1InterleavedPolicy::L1Usage; + + void SetUp() override { + context.loadDialect(); + module = mlir::ModuleOp::create(builder.getUnknownLoc()); + builder.setInsertionPointToStart(&module->getBodyRegion().front()); + createFuncOp(); + deviceAttr = mlir::tt::getCurrentScopeDevice(func); + } + + llvm::SmallVector getTensorShape() { + return {TensorDimX, TensorDimY}; + } + + mlir::RankedTensorType getTensorRankedType() { + return mlir::RankedTensorType::get(getTensorShape(), builder.getF32Type()); + } + + mlir::Value createEmptyTensor() { + ShapeAttr shapeAttr = ShapeAttr::get(&context, getTensorShape()); + return builder.create(builder.getUnknownLoc(), + getTensorRankedType(), nullptr, shapeAttr, + nullptr, nullptr, nullptr); + } + + mlir::func::FuncOp createFuncOp() { + mlir::SmallVector input; + input.push_back(getTensorRankedType()); + + mlir::SmallVector output; + output.push_back(getTensorRankedType()); + + auto funcType = builder.getType( + mlir::TypeRange(input), mlir::TypeRange(output)); + func = builder.create(builder.getUnknownLoc(), "test", + funcType); + + mlir::Block *block = func.addEntryBlock(); + block->addArgument(getTensorRankedType(), builder.getUnknownLoc()); + block->addArgument(getTensorRankedType(), builder.getUnknownLoc()); + + 
builder.setInsertionPointToStart(block); + + return func; + } + + void addLayoutForOp(mlir::Operation *op, + llvm::DenseMap> &legalLayouts, + BufferType memorySpace, + TensorMemoryLayout tensorMemoryLayout) { + if (legalLayouts.find(op) == legalLayouts.end()) { + legalLayouts[op] = std::vector{TTNNLayoutAttr::get( + &context, getTensorRankedType().getShape(), builder.getF32Type(), + memorySpace, mlir::tt::GridAttr::get(&context, {8, 8}), + tensorMemoryLayout)}; + } else { + legalLayouts[op].push_back(TTNNLayoutAttr::get( + &context, getTensorRankedType().getShape(), builder.getF32Type(), + memorySpace, mlir::tt::GridAttr::get(&context, {8, 8}), + tensorMemoryLayout)); + } + } + + void prepareOpForGreedyConfigPicker( + mlir::Operation *op, uint64_t outputL1Usage, uint64_t requiredL1Usage, + llvm::DenseMap> + &legalLayouts, + llvm::DenseMap &opsL1Usage) { + + // Add two legal layouts for the op with different buffer + // types: DRAM and L1. + addLayoutForOp(op, legalLayouts, BufferType::DRAM, + TensorMemoryLayout::Interleaved); + addLayoutForOp(op, legalLayouts, BufferType::L1, + TensorMemoryLayout::Interleaved); + + L1Usage l1Usage; + l1Usage.outputL1Usage = outputL1Usage; + l1Usage.requiredL1Usage = requiredL1Usage; + opsL1Usage[op] = l1Usage; + } + + void TearDown() override {} +}; + +TEST_F(L1InterleavedPolicyBase, VerifyGreedyPolicy) { + std::vector l1ChainConfigs; + llvm::DenseMap> legalLayouts; + llvm::DenseMap> + schedule; + llvm::DenseMap opsL1Usage; + constexpr uint64_t usableL1CacheSize = 15; + + // Create operand A + mlir::Value dest = createEmptyTensor(); + mlir::Value lhs = func.getBody().getBlocks().front().getArgument(0); + mlir::Value rhs = func.getBody().getBlocks().front().getArgument(1); + mlir::Operation *opA = + builder.create(builder.getUnknownLoc(), lhs, rhs, dest); + uint64_t outputL1Usage = 2; + uint64_t requiredL1Usage = 8; + prepareOpForGreedyConfigPicker(opA, outputL1Usage, requiredL1Usage, + legalLayouts, opsL1Usage); + + // Create 
operand B + dest = createEmptyTensor(); + lhs = func.getBody().getBlocks().front().getArgument(0); + rhs = func.getBody().getBlocks().front().getArgument(1); + mlir::Operation *opB = + builder.create(builder.getUnknownLoc(), lhs, rhs, dest); + outputL1Usage = 3; + requiredL1Usage = 7; + prepareOpForGreedyConfigPicker(opB, outputL1Usage, requiredL1Usage, + legalLayouts, opsL1Usage); + + // Create operand C + dest = createEmptyTensor(); + lhs = func.getBody().getBlocks().front().getArgument(0); + rhs = func.getBody().getBlocks().front().getArgument(1); + mlir::Operation *opC = + builder.create(builder.getUnknownLoc(), lhs, rhs, dest); + outputL1Usage = 1; + requiredL1Usage = 9; + prepareOpForGreedyConfigPicker(opC, outputL1Usage, requiredL1Usage, + legalLayouts, opsL1Usage); + + // Create base op D + dest = createEmptyTensor(); + lhs = func.getBody().getBlocks().front().getArgument(0); + rhs = func.getBody().getBlocks().front().getArgument(1); + mlir::Operation *opD = + builder.create(builder.getUnknownLoc(), lhs, rhs, dest); + outputL1Usage = 4; + requiredL1Usage = 0; + prepareOpForGreedyConfigPicker(opD, outputL1Usage, requiredL1Usage, + legalLayouts, opsL1Usage); + + // Run greedy config picker policy + L1InterleavedPolicy l1InterleavedPolicy(nullptr, l1ChainConfigs, legalLayouts, + schedule, usableL1CacheSize); + OpConfig greedyConfig = l1InterleavedPolicy.getGreedyConfig(opD, opsL1Usage); + + // Sanity checks + ASSERT_TRUE(greedyConfig.baseOp == opD); + ASSERT_TRUE(greedyConfig.layouts.size() == 4); + ASSERT_TRUE(greedyConfig.precedence.size() == 3); + + // All layouts should be using L1 buffer type + for (const auto &[op, layout] : greedyConfig.layouts) { + ASSERT_TRUE(layout.hasL1BufferType()); + } + + // Precedence order for op D should be: C, A, B + ASSERT_EQ(greedyConfig.precedence[0], opC); + ASSERT_EQ(greedyConfig.precedence[1], opA); + ASSERT_EQ(greedyConfig.precedence[2], opB); +}