diff --git a/.gitignore b/.gitignore
index 30672f9bf..8663a2ff0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,7 @@ third_party/tt-metal
 .cache
 *pycache*
 *.egg-info
+ttrt-artifacts/*
+query_results.json
+run_results.json
+ttrt_report.xml
diff --git a/include/ttmlir/Dialect/TT/IR/TTOpsTypes.td b/include/ttmlir/Dialect/TT/IR/TTOpsTypes.td
index 04f2b64af..4ad64de6f 100644
--- a/include/ttmlir/Dialect/TT/IR/TTOpsTypes.td
+++ b/include/ttmlir/Dialect/TT/IR/TTOpsTypes.td
@@ -288,7 +288,9 @@ def TT_LayoutAttr : TT_Attr<"Layout", "layout"> {
     bool isSystemMemorySpace() const { return ::mlir::tt::isSystemMemorySpace(getMemorySpace()); }
     bool isDeviceMemorySpace() const { return ::mlir::tt::isDeviceMemorySpace(getMemorySpace()); }
     bool hasShardedTensorMemoryLayout() const;
+    bool hasInterleavedTensorMemoryLayout() const;
     bool hasShardedL1TensorMemoryLayout() const;
+    bool hasInterleavedL1TensorMemoryLayout() const;
     bool isTiled() const;
     Type getElementType() const;
     Type getScalarElementType() const;
diff --git a/include/ttmlir/Dialect/TT/Utils/MemoryLayoutAnalysisParams.h b/include/ttmlir/Dialect/TT/Utils/MemoryLayoutAnalysisParams.h
new file mode 100644
index 000000000..de6b591d1
--- /dev/null
+++ b/include/ttmlir/Dialect/TT/Utils/MemoryLayoutAnalysisParams.h
@@ -0,0 +1,47 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTMLIR_DIALECT_TT_UTILS_MEMORYLAYOUTANALYSISPARAMS_H
+#define TTMLIR_DIALECT_TT_UTILS_MEMORYLAYOUTANALYSISPARAMS_H
+
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Support/CommandLine.h"
+
+namespace mlir::tt {
+
+enum class MemoryLayoutAnalysisPolicyType { DFSharding, L1Interleaved };
+
+struct MemoryLayoutAnalysisPolicyTypeParser
+    : public llvm::cl::parser<MemoryLayoutAnalysisPolicyType> {
+public:
+  MemoryLayoutAnalysisPolicyTypeParser(llvm::cl::Option &opt)
+      : llvm::cl::parser<MemoryLayoutAnalysisPolicyType>(opt) {}
+
+  bool parse(llvm::cl::Option &opt, llvm::StringRef argName,
+             llvm::StringRef arg, MemoryLayoutAnalysisPolicyType &value) {
+    value = llvm::StringSwitch<MemoryLayoutAnalysisPolicyType>(arg)
+                .Case("DFSharding", MemoryLayoutAnalysisPolicyType::DFSharding)
+                .Case("L1Interleaved",
+                      MemoryLayoutAnalysisPolicyType::L1Interleaved);
+    return false;
+  }
+
+  static void print(llvm::raw_ostream &os,
+                    const MemoryLayoutAnalysisPolicyType &value) {
+    llvm::StringRef policy;
+    switch (value) {
+    case MemoryLayoutAnalysisPolicyType::DFSharding:
+      policy = "DFSharding";
+      break;
+    case MemoryLayoutAnalysisPolicyType::L1Interleaved:
+      policy = "L1Interleaved";
+      break;
+    }
+    os << "memory-layout-analysis-policy=" << policy << "\n";
+  }
+};
+
+} // namespace mlir::tt
+
+#endif // TTMLIR_DIALECT_TT_UTILS_MEMORYLAYOUTANALYSISPARAMS_H
diff --git a/include/ttmlir/Dialect/TT/Utils/OverrideParams.h b/include/ttmlir/Dialect/TT/Utils/OverrideParams.h
index ed7967c07..56cde1c07 100644
--- a/include/ttmlir/Dialect/TT/Utils/OverrideParams.h
+++ b/include/ttmlir/Dialect/TT/Utils/OverrideParams.h
@@ -6,7 +6,6 @@
 #define TTMLIR_DIALECT_TT_UTILS_OVERRIDEPARAMS_H
 
 #include "ttmlir/Dialect/TT/IR/TTOpsTypes.h"
-#include
 #include
 
 namespace mlir::tt {
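The new MemoryLayoutAnalysisPolicyTypeParser is a standard llvm::cl custom parser, so the policy enum can back any LLVM command-line or pass option. A minimal sketch of standalone usage follows; the option name "test-policy" and the variable around it are illustrative only and are not part of this change:

    #include "ttmlir/Dialect/TT/Utils/MemoryLayoutAnalysisParams.h"

    // Hypothetical option wired to the new parser; after
    // llvm::cl::ParseCommandLineOptions, "--test-policy=L1Interleaved"
    // yields MemoryLayoutAnalysisPolicyType::L1Interleaved.
    static llvm::cl::opt<mlir::tt::MemoryLayoutAnalysisPolicyType, false,
                         mlir::tt::MemoryLayoutAnalysisPolicyTypeParser>
        testPolicy("test-policy",
                   llvm::cl::desc("Illustrative option using the new parser."),
                   llvm::cl::init(
                       mlir::tt::MemoryLayoutAnalysisPolicyType::DFSharding));

Note that parse() always returns false (success) and the StringSwitch has no Default case, so an unrecognized policy string is not rejected through the usual cl error path; the pipeline and pass options below rely on the same behavior.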
diff --git a/include/ttmlir/Dialect/TTNN/Analysis/DFShardingPolicy.h b/include/ttmlir/Dialect/TTNN/Analysis/DFShardingPolicy.h
index 6ef8476b0..6223ad429 100644
--- a/include/ttmlir/Dialect/TTNN/Analysis/DFShardingPolicy.h
+++ b/include/ttmlir/Dialect/TTNN/Analysis/DFShardingPolicy.h
@@ -7,19 +7,16 @@
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "ttmlir/Dialect/TTNN/Analysis/L1ChainConfig.h"
+#include "ttmlir/Dialect/TTNN/Analysis/MemoryLayoutAnalysisPolicy.h"
 
 namespace mlir::tt::ttnn {
 
 // Process ops in DFS schedulable order and build shard chain configs.
 // Schedule is also produced as a side effect of sharding.
 //
-class DFShardingPolicy {
+class DFShardingPolicy : public MemoryLayoutAnalysisPolicy {
 private:
-  Operation *rootOp;
-  std::vector<L1ChainConfig> *l1ChainConfigs;
-  llvm::DenseMap<Operation *, std::vector<tt::LayoutAttr>> legalLayouts;
-  llvm::DenseMap<func::FuncOp, llvm::SmallVector<Operation *>> *schedule;
-  unsigned usableL1CacheSize = 0;
+  std::unordered_set<Edge> overrideReshardEdges;
 
 public:
   DFShardingPolicy(
@@ -28,11 +25,15 @@ class DFShardingPolicy {
           &legalLayouts,
       llvm::DenseMap<func::FuncOp, llvm::SmallVector<Operation *>> &schedule,
       unsigned usableL1CacheSize)
-      : rootOp(rootOp), l1ChainConfigs(&l1ChainConfigs),
-        legalLayouts(legalLayouts), schedule(&schedule),
-        usableL1CacheSize(usableL1CacheSize) {}
+      : MemoryLayoutAnalysisPolicy(rootOp, l1ChainConfigs, legalLayouts,
+                                   schedule, usableL1CacheSize),
+        overrideReshardEdges() {}
 
-  void run(const std::unordered_set<Edge> &overrideReshardEdges);
+  void run() final;
+
+  void setOverrideReshardEdges(const std::unordered_set<Edge> &reshardEdges) {
+    overrideReshardEdges = reshardEdges;
+  }
 };
 
 } // namespace mlir::tt::ttnn
diff --git a/include/ttmlir/Dialect/TTNN/Analysis/L1InterleavedPolicy.h b/include/ttmlir/Dialect/TTNN/Analysis/L1InterleavedPolicy.h
new file mode 100644
index 000000000..753c07d80
--- /dev/null
+++ b/include/ttmlir/Dialect/TTNN/Analysis/L1InterleavedPolicy.h
@@ -0,0 +1,30 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTMLIR_DIALECT_TTNN_ANALYSIS_L1INTERLEAVEDPOLICY_H
+#define TTMLIR_DIALECT_TTNN_ANALYSIS_L1INTERLEAVEDPOLICY_H
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "ttmlir/Dialect/TTNN/Analysis/L1ChainConfig.h"
+#include "ttmlir/Dialect/TTNN/Analysis/MemoryLayoutAnalysisPolicy.h"
+
+namespace mlir::tt::ttnn {
+
+class L1InterleavedPolicy : public MemoryLayoutAnalysisPolicy {
+public:
+  L1InterleavedPolicy(
+      Operation *rootOp, std::vector<L1ChainConfig> &l1ChainConfigs,
+      const llvm::DenseMap<Operation *, std::vector<tt::LayoutAttr>>
+          &legalLayouts,
+      llvm::DenseMap<func::FuncOp, llvm::SmallVector<Operation *>> &schedule,
+      unsigned usableL1CacheSize)
+      : MemoryLayoutAnalysisPolicy(rootOp, l1ChainConfigs, legalLayouts,
+                                   schedule, usableL1CacheSize) {}
+
+  void run() final;
+};
+
+} // namespace mlir::tt::ttnn
+
+#endif // TTMLIR_DIALECT_TTNN_ANALYSIS_L1INTERLEAVEDPOLICY_H
diff --git a/include/ttmlir/Dialect/TTNN/Analysis/MemoryLayoutAnalysis.h b/include/ttmlir/Dialect/TTNN/Analysis/MemoryLayoutAnalysis.h
index b7d864b72..39d059555 100644
--- a/include/ttmlir/Dialect/TTNN/Analysis/MemoryLayoutAnalysis.h
+++ b/include/ttmlir/Dialect/TTNN/Analysis/MemoryLayoutAnalysis.h
@@ -6,20 +6,18 @@
 #define TTMLIR_DIALECT_TTNN_ANALYSIS_MEMORYLAYOUTANALYSIS_H
 
 #include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "ttmlir/Dialect/TT/Utils/MemoryLayoutAnalysisParams.h"
 #include "ttmlir/Dialect/TTNN/Analysis/Edge.h"
 #include "ttmlir/Dialect/TTNN/Analysis/L1ChainConfig.h"
 #include "ttmlir/Dialect/TTNN/Analysis/TTNNAnalysis.h"
 
 namespace mlir::tt::ttnn {
 
-enum class MemoryLayoutAnalysisPolicyType {
-  DFSharding,
-};
-
 struct MemoryLayoutAnalysisInput {
   llvm::DenseMap<Operation *, std::vector<tt::LayoutAttr>> legalLayouts;
   unsigned usableL1CacheSize = 0;
   std::unordered_set<Edge> overrideReshardEdges;
+  MemoryLayoutAnalysisPolicyType policy;
 
   MemoryLayoutAnalysisInput() : legalLayouts() {}
 
@@ -27,9 +25,10 @@ struct MemoryLayoutAnalysisInput {
       const llvm::DenseMap<Operation *, std::vector<tt::LayoutAttr>>
           &legalLayouts,
       unsigned usableL1CacheSize,
-      const std::unordered_set<Edge> &overrideReshardEdges)
+      const std::unordered_set<Edge> &overrideReshardEdges,
+      MemoryLayoutAnalysisPolicyType policy)
       : legalLayouts(legalLayouts), usableL1CacheSize(usableL1CacheSize),
-        overrideReshardEdges(overrideReshardEdges) {}
+        overrideReshardEdges(overrideReshardEdges), policy(policy) {}
 
   bool operator==(const MemoryLayoutAnalysisInput &rhs) const {
     return legalLayouts == rhs.legalLayouts;
diff --git a/include/ttmlir/Dialect/TTNN/Analysis/MemoryLayoutAnalysisPolicy.h b/include/ttmlir/Dialect/TTNN/Analysis/MemoryLayoutAnalysisPolicy.h
new file mode 100644
index 000000000..aecd9c6a4
--- /dev/null
+++ b/include/ttmlir/Dialect/TTNN/Analysis/MemoryLayoutAnalysisPolicy.h
@@ -0,0 +1,39 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef TTMLIR_DIALECT_TTNN_ANALYSIS_MEMORYLAYOUTANALYSISPOLICY_H
+#define TTMLIR_DIALECT_TTNN_ANALYSIS_MEMORYLAYOUTANALYSISPOLICY_H
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "ttmlir/Dialect/TTNN/Analysis/L1ChainConfig.h"
+
+namespace mlir::tt::ttnn {
+
+class MemoryLayoutAnalysisPolicy {
+protected:
+  Operation *rootOp;
+  std::vector<L1ChainConfig> *l1ChainConfigs;
+  llvm::DenseMap<Operation *, std::vector<tt::LayoutAttr>> legalLayouts;
+  llvm::DenseMap<func::FuncOp, llvm::SmallVector<Operation *>> *schedule;
+  unsigned usableL1CacheSize = 0;
+
+public:
+  virtual ~MemoryLayoutAnalysisPolicy() {};
+
+  MemoryLayoutAnalysisPolicy(
+      Operation *rootOp, std::vector<L1ChainConfig> &l1ChainConfigs,
+      const llvm::DenseMap<Operation *, std::vector<tt::LayoutAttr>>
+          &legalLayouts,
+      llvm::DenseMap<func::FuncOp, llvm::SmallVector<Operation *>> &schedule,
+      unsigned usableL1CacheSize)
+      : rootOp(rootOp), l1ChainConfigs(&l1ChainConfigs),
+        legalLayouts(legalLayouts), schedule(&schedule),
+        usableL1CacheSize(usableL1CacheSize) {}
+
+  virtual void run() = 0;
+};
+
+} // namespace mlir::tt::ttnn
+
+#endif // TTMLIR_DIALECT_TTNN_ANALYSIS_MEMORYLAYOUTANALYSISPOLICY_H
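Both of the in-tree policies now derive from this base, so adding another policy only needs a subclass that overrides run() plus a new enum case and dispatch branch. A hypothetical sketch under those assumptions (GreedyPolicy is illustrative and not part of this change):

    namespace mlir::tt::ttnn {

    class GreedyPolicy : public MemoryLayoutAnalysisPolicy {
    public:
      // Reuse the base constructor; rootOp, l1ChainConfigs, legalLayouts,
      // schedule and usableL1CacheSize are stored by the base class.
      using MemoryLayoutAnalysisPolicy::MemoryLayoutAnalysisPolicy;

      // Same contract as the in-tree policies: walk rootOp, fill
      // *l1ChainConfigs and *schedule.
      void run() final {}
    };

    } // namespace mlir::tt::ttnn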
diff --git a/include/ttmlir/Dialect/TTNN/Pipelines/TTNNPipelines.h b/include/ttmlir/Dialect/TTNN/Pipelines/TTNNPipelines.h
index 9988bbcc1..a42ec0ea8 100644
--- a/include/ttmlir/Dialect/TTNN/Pipelines/TTNNPipelines.h
+++ b/include/ttmlir/Dialect/TTNN/Pipelines/TTNNPipelines.h
@@ -6,13 +6,11 @@
 #define TTMLIR_DIALECT_TTNN_PIPELINES_TTNNPIPELINES_H
 
 #include "mlir/Pass/PassOptions.h"
+#include "ttmlir/Dialect/TT/Utils/MemoryLayoutAnalysisParams.h"
 #include "ttmlir/Dialect/TT/Utils/OverrideParams.h"
-#include
-#include
-#include
-#include
 
 namespace mlir::tt::ttnn {
+
 // Options for the TTIR to TTNN backend pipeline.
 //
 struct TTIRToTTNNBackendPipelineOptions
@@ -85,6 +83,14 @@ struct TTIRToTTNNBackendPipelineOptions
           "of shard specs."),
       llvm::cl::init(false)};
 
+  // Specify policy for memory layout analysis.
+  //
+  Option<MemoryLayoutAnalysisPolicyType, MemoryLayoutAnalysisPolicyTypeParser>
+      memoryLayoutAnalysisPolicy{
+          *this, "memory-layout-analysis-policy",
+          llvm::cl::desc("Specify policy for memory layout analysis."),
+          llvm::cl::init(MemoryLayoutAnalysisPolicyType::DFSharding)};
+
   // Option to provide a system descriptor flatbuffer file to compile
   // against.
   //
diff --git a/include/ttmlir/Dialect/TTNN/Transforms/Optimizer.h b/include/ttmlir/Dialect/TTNN/Transforms/Optimizer.h
index 064495f31..06074a0a3 100644
--- a/include/ttmlir/Dialect/TTNN/Transforms/Optimizer.h
+++ b/include/ttmlir/Dialect/TTNN/Transforms/Optimizer.h
@@ -19,6 +19,8 @@ struct TTNNOptimizerOptions {
   llvm::StringMap<LayoutOverrideParams> overrideOutputLayout =
       llvm::StringMap<LayoutOverrideParams>();
   bool memoryLayoutAnalysisEnabled = false;
+  MemoryLayoutAnalysisPolicyType memoryLayoutAnalysisPolicy =
+      MemoryLayoutAnalysisPolicyType::DFSharding;
   bool memReconfigEnabled = false;
   int64_t maxLegalLayouts = 64;
 };
@@ -95,6 +97,7 @@ class TTNNOptimizerBase : public ::mlir::OperationPass<::mlir::ModuleOp> {
     memoryLayoutAnalysisEnabled =
         std::move(options.memoryLayoutAnalysisEnabled);
     memReconfigEnabled = std::move(options.memReconfigEnabled);
+    memoryLayoutAnalysisPolicy = std::move(options.memoryLayoutAnalysisPolicy);
     maxLegalLayouts = std::move(options.maxLegalLayouts);
   }
@@ -122,6 +125,12 @@ class TTNNOptimizerBase : public ::mlir::OperationPass<::mlir::ModuleOp> {
           "we support all "
           "types of shard specs."),
       ::llvm::cl::init(false)};
+  ::mlir::Pass::Option<mlir::tt::MemoryLayoutAnalysisPolicyType,
+                       mlir::tt::MemoryLayoutAnalysisPolicyTypeParser>
+      memoryLayoutAnalysisPolicy{
+          *this, "memory-layout-analysis-policy",
+          llvm::cl::desc("Specify policy for memory layout analysis."),
+          llvm::cl::init(MemoryLayoutAnalysisPolicyType::DFSharding)};
   ::mlir::Pass::Option<int64_t> maxLegalLayouts{
       *this, "max-legal-layouts",
       ::llvm::cl::desc(
diff --git a/include/ttmlir/Dialect/TTNN/Transforms/Passes.h b/include/ttmlir/Dialect/TTNN/Transforms/Passes.h
index 8e79db96a..fa05f41de 100644
--- a/include/ttmlir/Dialect/TTNN/Transforms/Passes.h
+++ b/include/ttmlir/Dialect/TTNN/Transforms/Passes.h
@@ -9,6 +9,7 @@
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
 #include "ttmlir/Dialect/TT/Utils/OverrideParams.h"
+#include "ttmlir/Dialect/TTNN/Analysis/MemoryLayoutAnalysis.h"
 #include "ttmlir/Dialect/TTNN/IR/TTNN.h"
 #include "ttmlir/Dialect/TTNN/IR/TTNNOps.h"
diff --git a/lib/Dialect/TT/IR/TTOpsTypes.cpp b/lib/Dialect/TT/IR/TTOpsTypes.cpp
index 6a20b2ba7..3f541a39d 100644
--- a/lib/Dialect/TT/IR/TTOpsTypes.cpp
+++ b/lib/Dialect/TT/IR/TTOpsTypes.cpp
@@ -579,6 +579,10 @@ bool LayoutAttr::hasShardedTensorMemoryLayout() const {
           getMemLayout() == TensorMemoryLayout::BlockSharded);
 }
 
+bool LayoutAttr::hasInterleavedTensorMemoryLayout() const {
+  return (getMemLayout() == TensorMemoryLayout::Interleaved);
+}
+
 bool LayoutAttr::hasShardedL1TensorMemoryLayout() const {
   return ::mlir::tt::isL1MemorySpace(getMemorySpace()) and
          (getMemLayout() == TensorMemoryLayout::HeightSharded or
@@ -586,6 +590,11 @@ bool LayoutAttr::hasShardedL1TensorMemoryLayout() const {
           getMemLayout() == TensorMemoryLayout::BlockSharded);
 }
 
+bool LayoutAttr::hasInterleavedL1TensorMemoryLayout() const {
+  return ::mlir::tt::isL1MemorySpace(getMemorySpace()) and
+         (getMemLayout() == TensorMemoryLayout::Interleaved);
+}
+
 bool LayoutAttr::isTiled() const {
   return ::mlir::isa<::mlir::tt::TileType>(getElementType());
 }
diff --git a/lib/Dialect/TTNN/Analysis/CMakeLists.txt b/lib/Dialect/TTNN/Analysis/CMakeLists.txt
index 25257db36..996064d79 100644
--- a/lib/Dialect/TTNN/Analysis/CMakeLists.txt
+++ b/lib/Dialect/TTNN/Analysis/CMakeLists.txt
@@ -4,6 +4,7 @@ add_mlir_dialect_library(MLIRTTNNAnalysis
   MemoryLayoutAnalysis.cpp
   L1ChainConfig.cpp
   DFShardingPolicy.cpp
+  L1InterleavedPolicy.cpp
   ShardSolver.cpp
 
   ADDITIONAL_HEADER_DIRS
diff --git a/lib/Dialect/TTNN/Analysis/DFShardingPolicy.cpp b/lib/Dialect/TTNN/Analysis/DFShardingPolicy.cpp
index 7a7470ad3..f5c93ddbf 100644
--- a/lib/Dialect/TTNN/Analysis/DFShardingPolicy.cpp
+++ b/lib/Dialect/TTNN/Analysis/DFShardingPolicy.cpp
@@ -8,8 +8,7 @@
 
 namespace mlir::tt::ttnn {
 
-void DFShardingPolicy::run(
-    const std::unordered_set<Edge> &overrideReshardEdges) {
+void DFShardingPolicy::run() {
   rootOp->walk([&](func::FuncOp func) {
     DeviceAttr deviceAttr = getCurrentScopeDevice(func);
     mlir::tt::scheduler::Scheduler scheduler(&func);
diff --git a/lib/Dialect/TTNN/Analysis/L1InterleavedPolicy.cpp b/lib/Dialect/TTNN/Analysis/L1InterleavedPolicy.cpp
new file mode 100644
index 000000000..bad37d94f
--- /dev/null
+++ b/lib/Dialect/TTNN/Analysis/L1InterleavedPolicy.cpp
@@ -0,0 +1,155 @@
+// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "ttmlir/Dialect/TTNN/Analysis/L1InterleavedPolicy.h"
+#include "ttmlir/Dialect/TT/IR/TTOpsTypes.h"
+#include "ttmlir/Dialect/TTNN/IR/TTNNOps.h"
+#include "ttmlir/Scheduler/Scheduler.h"
+
+namespace mlir::tt::ttnn {
+
+uint64_t getOpOutputLayoutUsage(
+    Operation *op,
+    llvm::DenseMap<Operation *, std::vector<tt::LayoutAttr>> &legalLayouts,
+    DeviceAttr &deviceAttr) {
+  tt::LayoutAttr opLayout = legalLayouts.lookup(op).front();
+  assert(opLayout.hasInterleavedL1TensorMemoryLayout());
+
+  llvm::ArrayRef<int64_t> opOutputTensorShape =
+      mlir::cast<RankedTensorType>(op->getResult(0).getType()).getShape();
+
+  uint64_t opL1OutputUsage = deviceAttr.getLayoutSizeBytes(
+      opOutputTensorShape, opLayout, opLayout.getMemorySpace());
+  return opL1OutputUsage;
+}
+
+void L1InterleavedPolicy::run() {
+  rootOp->walk([&](func::FuncOp func) {
+    DeviceAttr deviceAttr = getCurrentScopeDevice(func);
+    mlir::tt::scheduler::Scheduler scheduler(&func);
+    llvm::SmallVector<Operation *> scheduleableOps;
+    llvm::DenseMap<Operation *, tt::LayoutAttr> selectedOpLayout;
+    Operation *currentOp = nullptr;
+
+    // TODO(fbajraktari): Add algorithm description. Currently, the algorithm
+    // is the same as for DFSharding policy, but works only for L1 interleaved.
+    //
+    l1ChainConfigs->push_back(L1ChainConfig());
+    while (scheduler.hasUnscheduledOps()) {
+      scheduleableOps = scheduler.getScheduleableOps();
+
+      // Before starting a l1 chain, schedule layout/memory management ops
+      // first until they are exhausted from schedulable ops.
+      //
+      if (l1ChainConfigs->back().isEmpty()) {
+        for (auto *op : scheduleableOps) {
+          if (isa<ToLayoutOp>(op)) {
+            currentOp = op;
+            break;
+          }
+        }
+      }
+
+      if (currentOp == nullptr) {
+        currentOp = scheduleableOps[0];
+      }
+
+      // Schedule currentOp.
+      //
+      scheduler.scheduleOp(currentOp);
+
+      // Skip starting sharding chain if currentOp is a memory management op.
+      //
+      if (l1ChainConfigs->back().isEmpty() && isa<ToLayoutOp>(currentOp)) {
+        currentOp = nullptr;
+        continue;
+      }
+
+      if (scheduler.hasUnscheduledOps()) {
+        scheduleableOps = scheduler.getScheduleableOps();
+
+        // Check if currentOp has a valid successor.
+        //
+        Operation *nextOp = nullptr;
+        for (auto *op : scheduleableOps) {
+          for (auto operand : op->getOperands()) {
+            if (operand.getDefiningOp() == currentOp) {
+              nextOp = op;
+              break;
+            }
+          }
+        }
+
+        if (nextOp) {
+
+          // V1: Check that currentOp is not fork/join op.
+          //
+          bool validForL1Interleaved =
+              currentOp->hasOneUse() &&
+              legalLayouts.lookup(currentOp).size() > 0 &&
+              legalLayouts.lookup(nextOp).size() > 0;
+
+          if (validForL1Interleaved) {
+            // Figure out this const based on exec data, but will be replaced
+            // with API.
+            //
+            constexpr float tensorL1UsageCap = 0.8;
+            uint64_t currentOpL1OutputUsage =
+                getOpOutputLayoutUsage(currentOp, legalLayouts, deviceAttr);
+            uint64_t nextOpL1OutputUsage =
+                getOpOutputLayoutUsage(nextOp, legalLayouts, deviceAttr);
+            bool l1UsageValid = (currentOpL1OutputUsage + nextOpL1OutputUsage) <
+                                tensorL1UsageCap * usableL1CacheSize;
+
+            if (l1UsageValid) {
+              selectedOpLayout[currentOp] =
+                  legalLayouts.lookup(currentOp).front();
+
+              // Add currentOp to l1 chain config.
+              //
+              OpL1MemSpec shardSpec;
+              shardSpec.op = currentOp;
+
+              // Hardcoded tensor split factor for now, until pipeline OP
+              // support is added.
+              //
+              shardSpec.tensorSplitFactor = 1;
+              l1ChainConfigs->back().addOpL1MemSpec(std::move(shardSpec));
+
+              // Update currentOp pointer.
+              //
+              currentOp = nextOp;
+              continue;
+            }
+          }
+        }
+
+        currentOp = nullptr;
+        if (!l1ChainConfigs->back().isEmpty()) {
+          l1ChainConfigs->back().build();
+          l1ChainConfigs->push_back(L1ChainConfig());
+        }
+      }
+    }
+
+    if (l1ChainConfigs->back().isEmpty()) {
+      l1ChainConfigs->pop_back();
+    }
+
+    // Schedule
+    //
+    (*schedule)[func] = scheduler.getSchedule();
+
+    // Resolve l1 chain configs.
+    //
+    for (auto &l1ChainConfig : *l1ChainConfigs) {
+      l1ChainConfig.resolve();
+
+      std::unordered_set<Edge> memReconfigEdges;
+      l1ChainConfig.complete(selectedOpLayout, memReconfigEdges);
+    }
+  });
+}
+
+} // namespace mlir::tt::ttnn
diff --git a/lib/Dialect/TTNN/Analysis/MemoryLayoutAnalysis.cpp b/lib/Dialect/TTNN/Analysis/MemoryLayoutAnalysis.cpp
index e630782f4..302334d1b 100644
--- a/lib/Dialect/TTNN/Analysis/MemoryLayoutAnalysis.cpp
+++ b/lib/Dialect/TTNN/Analysis/MemoryLayoutAnalysis.cpp
@@ -4,6 +4,7 @@
 
 #include "ttmlir/Dialect/TTNN/Analysis/MemoryLayoutAnalysis.h"
 #include "ttmlir/Dialect/TTNN/Analysis/DFShardingPolicy.h"
+#include "ttmlir/Dialect/TTNN/Analysis/L1InterleavedPolicy.h"
 
 namespace mlir::tt::ttnn {
 
@@ -33,18 +34,46 @@ filterShardedOnly(const llvm::DenseMap<Operation *, std::vector<tt::LayoutAttr>>
   return shardedLayouts;
 }
 
-void MemoryLayoutAnalysis::analysisImplementation() {
-  MemoryLayoutAnalysisPolicyType policy =
-      MemoryLayoutAnalysisPolicyType::DFSharding;
+llvm::DenseMap<Operation *, std::vector<tt::LayoutAttr>>
+filterL1InterleavedOnly(
+    const llvm::DenseMap<Operation *, std::vector<tt::LayoutAttr>>
+        &legalLayouts) {
+  llvm::DenseMap<Operation *, std::vector<tt::LayoutAttr>> l1InterleavedLayouts;
+  for (const auto &opLayouts : legalLayouts) {
+    std::vector<tt::LayoutAttr> opL1InterleavedLayouts;
+    for (const auto &layout : opLayouts.second) {
+      if (layout.hasInterleavedL1TensorMemoryLayout()) {
+        opL1InterleavedLayouts.push_back(layout);
+      }
+    }
+
+    l1InterleavedLayouts[opLayouts.first] = opL1InterleavedLayouts;
+  }
 
-  switch (policy) {
-  case MemoryLayoutAnalysisPolicyType::DFSharding:
+  return l1InterleavedLayouts;
+}
+
+void MemoryLayoutAnalysis::analysisImplementation() {
+  // Apply specific memory layout analysis policy.
+  //
+  switch (analysisInput.policy) {
+  case MemoryLayoutAnalysisPolicyType::DFSharding: {
     DFShardingPolicy dfShardingPolicy(
         op, l1ChainConfigs, filterShardedOnly(analysisInput.legalLayouts),
         analysisResult.schedule, analysisInput.usableL1CacheSize);
-    dfShardingPolicy.run(analysisInput.overrideReshardEdges);
+    dfShardingPolicy.setOverrideReshardEdges(
+        analysisInput.overrideReshardEdges);
+    dfShardingPolicy.run();
     break;
   }
+  case MemoryLayoutAnalysisPolicyType::L1Interleaved: {
+    L1InterleavedPolicy l1InterleavedPolicy(
+        op, l1ChainConfigs, filterL1InterleavedOnly(analysisInput.legalLayouts),
+        analysisResult.schedule, analysisInput.usableL1CacheSize);
+    l1InterleavedPolicy.run();
+    break;
+  }
+  }
 
   // Copy over default legal layouts.
   //
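To make the capacity check in L1InterleavedPolicy::run concrete, with illustrative numbers only: if currentOp's output layout needs 256 KiB of L1 and nextOp's needs 384 KiB, the pair is kept in the chain only when 256 KiB + 384 KiB = 640 KiB < 0.8 * usableL1CacheSize, that is, when the usable L1 budget exceeds 800 KiB. Otherwise the chain is cut at that edge and the op keeps its default legal layout, which is why the final relu and softmax outputs in the new Silicon tests are expected in DRAM rather than L1.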
diff --git a/lib/Dialect/TTNN/Pipelines/TTNNPipelines.cpp b/lib/Dialect/TTNN/Pipelines/TTNNPipelines.cpp
index 7f3baaeaf..772b51b04 100644
--- a/lib/Dialect/TTNN/Pipelines/TTNNPipelines.cpp
+++ b/lib/Dialect/TTNN/Pipelines/TTNNPipelines.cpp
@@ -51,6 +51,8 @@ void createTTNNPipelineAnalysisPasses(
   optimizerOptions.memoryLayoutAnalysisEnabled =
       options.memoryLayoutAnalysisEnabled;
   optimizerOptions.memReconfigEnabled = options.memReconfigEnabled;
+  optimizerOptions.memoryLayoutAnalysisPolicy =
+      options.memoryLayoutAnalysisPolicy;
   optimizerOptions.maxLegalLayouts = options.maxLegalLayouts;
   pm.addPass(mlir::tt::ttnn::createTTNNOptimizer(optimizerOptions));
 }
diff --git a/lib/Dialect/TTNN/Transforms/Optimizer.cpp b/lib/Dialect/TTNN/Transforms/Optimizer.cpp
index 37b4cfe64..2af08e9c9 100644
--- a/lib/Dialect/TTNN/Transforms/Optimizer.cpp
+++ b/lib/Dialect/TTNN/Transforms/Optimizer.cpp
@@ -71,7 +71,8 @@ class TTNNOptimizer : public impl::TTNNOptimizerBase<TTNNOptimizer> {
     MemoryLayoutAnalysis memoryLayoutAnalysis =
         getAnalysis<MemoryLayoutAnalysis>();
     memoryLayoutAnalysis.init(MemoryLayoutAnalysisInput(
-        legalLayouts, chipDesc.getUsableL1Size(), overrideReshardEdges));
+        legalLayouts, chipDesc.getUsableL1Size(), overrideReshardEdges,
+        memoryLayoutAnalysisPolicy));
     legalLayouts = memoryLayoutAnalysis.getResult().legalLayouts;
     opSchedule = memoryLayoutAnalysis.getResult().schedule;
     memReconfigEdges = memoryLayoutAnalysis.getResult().memReconfigEdges;
diff --git a/test/ttmlir/Dialect/TTNN/sharding_matmul_override_0.mlir b/test/ttmlir/Dialect/TTNN/sharding_matmul_override_0.mlir
index 9516f96f5..2e07f7f5c 100644
--- a/test/ttmlir/Dialect/TTNN/sharding_matmul_override_0.mlir
+++ b/test/ttmlir/Dialect/TTNN/sharding_matmul_override_0.mlir
@@ -2,7 +2,7 @@
 #any_device_tile = #tt.operand_constraint
 module attributes {} {
   func.func @forward(%arg0: tensor<64x128xbf16>, %arg1: tensor<128x96xbf16>, %arg2: tensor<96x64xbf16>) -> tensor<64x64xbf16> {
-    // CHECK: #[[LAYOUT_7:layout7]] = #tt.layout<{{.*}}, memref<{{.*}}>, #dram>, {{.*}}>
+    // CHECK: #[[LAYOUT_7:layout7]] = #tt.layout<{{.*}}, memref<{{.*}}, #dram>, {{.*}}>
     %0 = tensor.empty() : tensor<64x96xbf16>
     // CHECK: {{.*}} = "ttnn.matmul"{{.*}} -> tensor<64x96xbf16, #[[LAYOUT_7]]>
     %1 = "ttir.matmul"(%arg0, %arg1, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<64x128xbf16>, tensor<128x96xbf16>, tensor<64x96xbf16>) -> tensor<64x96xbf16>
diff --git a/test/ttmlir/Dialect/TTNN/sharding_matmul_override_32.mlir b/test/ttmlir/Dialect/TTNN/sharding_matmul_override_32.mlir
index 3e26d1490..8e984348f 100644
--- a/test/ttmlir/Dialect/TTNN/sharding_matmul_override_32.mlir
+++ b/test/ttmlir/Dialect/TTNN/sharding_matmul_override_32.mlir
@@ -3,7 +3,7 @@ module attributes {} {
   func.func @forward(%arg0: tensor<64x128xbf16>, %arg1: tensor<128x96xbf16>, %arg2: tensor<96x64xbf16>) -> tensor<64x64xbf16> {
     // CHECK: #[[L1_:.*]] = #tt.memory_space
-    // CHECK: #[[LAYOUT_7:layout7]] = #tt.layout<{{.*}}, memref<{{.*}}>, #l1_>, {{.*}}>
+    // CHECK: #[[LAYOUT_7:layout7]] = #tt.layout<{{.*}}, memref<{{.*}}, #l1_>, {{.*}}>
     %0 = tensor.empty() : tensor<64x96xbf16>
     // CHECK: {{.*}} = "ttnn.matmul"{{.*}} -> tensor<64x96xbf16, #[[LAYOUT_7]]>
     %1 = "ttir.matmul"(%arg0, %arg1, %0) <{operand_constraints = [#any_device_tile, #any_device_tile, #any_device_tile]}> : (tensor<64x128xbf16>, tensor<128x96xbf16>, tensor<64x96xbf16>) -> tensor<64x96xbf16>
diff --git a/test/ttmlir/Silicon/TTNN/all_l1_interleaved_policy.mlir b/test/ttmlir/Silicon/TTNN/all_l1_interleaved_policy.mlir
new file mode 100644
index 000000000..e09552c42
--- /dev/null
+++ b/test/ttmlir/Silicon/TTNN/all_l1_interleaved_policy.mlir
@@ -0,0 +1,31 @@
+// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path% enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s > %t.mlir
+// RUN: FileCheck %s --input-file=%t.mlir
+// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn
+#any_device = #tt.operand_constraint
+module attributes {} {
+  func.func @forward(%arg0: tensor<64x128xbf16>, %arg1: tensor<128x96xbf16>, %arg2: tensor<64x96xbf16>, %arg3: tensor<96x32xbf16>, %arg4: tensor<64x32xbf16>) -> tensor<64x32xbf16> {
+    // CHECK: #[[L1_:.*]] = #tt.memory_space
+    // CHECK: #[[LAYOUT_6:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <{{.*}}>, memref<{{.*}}, #l1_>, interleaved>
+    // CHECK: #[[LAYOUT_7:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <{{.*}}>, memref<{{.*}}, #l1_>, interleaved>
+    // CHECK: #[[LAYOUT_8:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <{{.*}}>, memref<{{.*}}, #dram>, interleaved>
+    %0 = tensor.empty() : tensor<64x96xbf16>
+    // CHECK: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<64x96xbf16, #[[LAYOUT_6]]>
+    %1 = "ttir.matmul"(%arg0, %arg1, %0) <{operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x128xbf16>, tensor<128x96xbf16>, tensor<64x96xbf16>) -> tensor<64x96xbf16>
+    %2 = tensor.empty() : tensor<64x96xbf16>
+    // CHECK: %{{.*}} = "ttnn.add"{{.*}} -> tensor<64x96xbf16, #[[LAYOUT_6]]>
+    %3 = "ttir.add"(%1, %arg2, %2) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x96xbf16>, tensor<64x96xbf16>, tensor<64x96xbf16>) -> tensor<64x96xbf16>
+    %4 = tensor.empty() : tensor<64x96xbf16>
+    // CHECK: %{{.*}} = "ttnn.relu"{{.*}} -> tensor<64x96xbf16, #[[LAYOUT_6]]>
+    %5 = "ttir.relu"(%3, %4) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x96xbf16>, tensor<64x96xbf16>) -> tensor<64x96xbf16>
+    %6 = tensor.empty() : tensor<64x32xbf16>
+    // CHECK: %{{.*}} = "ttnn.matmul"{{.*}} -> tensor<64x32xbf16, #[[LAYOUT_7]]>
+    %7 = "ttir.matmul"(%5, %arg3, %6) <{operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x96xbf16>, tensor<96x32xbf16>, tensor<64x32xbf16>) -> tensor<64x32xbf16>
+    %8 = tensor.empty() : tensor<64x32xbf16>
+    // CHECK: %{{.*}} = "ttnn.add"{{.*}} -> tensor<64x32xbf16, #[[LAYOUT_7]]>
+    %9 = "ttir.add"(%7, %arg4, %8) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x32xbf16>, tensor<64x32xbf16>, tensor<64x32xbf16>) -> tensor<64x32xbf16>
+    %10 = tensor.empty() : tensor<64x32xbf16>
+    // CHECK: %{{.*}} = "ttnn.relu"{{.*}} -> tensor<64x32xbf16, #[[LAYOUT_8]]>
+    %11 = "ttir.relu"(%9, %10) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x32xbf16>, tensor<64x32xbf16>) -> tensor<64x32xbf16>
+    return %11 : tensor<64x32xbf16>
+  }
+}
diff --git a/test/ttmlir/Silicon/TTNN/large_tensors.mlir b/test/ttmlir/Silicon/TTNN/large_tensors.mlir
new file mode 100644
index 000000000..b258435db
--- /dev/null
+++ b/test/ttmlir/Silicon/TTNN/large_tensors.mlir
@@ -0,0 +1,19 @@
+// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path% enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s > %t.mlir
+// RUN: FileCheck %s --input-file=%t.mlir
+// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn
+#any_device = #tt.operand_constraint
+module attributes {} {
+  func.func @forward(%arg0: tensor<8192x8192xbf16>, %arg1: tensor<8192x8192xbf16>, %arg2: tensor<8192x8192xbf16>) -> tensor<8192x8192xbf16> {
+    // CHECK: #[[LAYOUT_2:layout2]] = #tt.layout<{{.*}}, memref<{{.*}}, #dram>, {{.*}}>
+    %0 = tensor.empty() : tensor<8192x8192xbf16>
+    // CHECK: %{{.*}} = "ttnn.add"{{.*}} -> tensor<8192x8192xbf16, #[[LAYOUT_2]]>
+    %1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<8192x8192xbf16>, tensor<8192x8192xbf16>, tensor<8192x8192xbf16>) -> tensor<8192x8192xbf16>
+    %2 = tensor.empty() : tensor<8192x8192xbf16>
+    // CHECK: %{{.*}} = "ttnn.add"{{.*}} -> tensor<8192x8192xbf16, #[[LAYOUT_2]]>
+    %3 = "ttir.add"(%1, %arg2, %2) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<8192x8192xbf16>, tensor<8192x8192xbf16>, tensor<8192x8192xbf16>) -> tensor<8192x8192xbf16>
+    %4 = tensor.empty() : tensor<8192x8192xbf16>
+    // CHECK: %{{.*}} = "ttnn.relu"{{.*}} -> tensor<8192x8192xbf16, #[[LAYOUT_2]]>
+    %7 = "ttir.relu"(%3, %4) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<8192x8192xbf16>, tensor<8192x8192xbf16>) -> tensor<8192x8192xbf16>
+    return %7 : tensor<8192x8192xbf16>
+  }
+}
diff --git a/test/ttmlir/Silicon/TTNN/mnist_l1_interleaved.mlir b/test/ttmlir/Silicon/TTNN/mnist_l1_interleaved.mlir
new file mode 100644
index 000000000..bd001a267
--- /dev/null
+++ b/test/ttmlir/Silicon/TTNN/mnist_l1_interleaved.mlir
@@ -0,0 +1,44 @@
+// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path% enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s > %t.mlir
+// RUN: FileCheck %s --input-file=%t.mlir
+// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn
+#any_device = #tt.operand_constraint
+#loc = loc("MNISTLinear":4294967295:0)
+module @"tt-forge-graph" attributes {} {
+  func.func @main(%arg0: tensor<1x784xf32> loc("MNISTLinear":4294967295:0), %arg1: tensor<1x10xf32> loc("MNISTLinear":4294967295:0), %arg2: tensor<256x10xf32> loc("MNISTLinear":4294967295:0), %arg3: tensor<1x256xf32> loc("MNISTLinear":4294967295:0), %arg4: tensor<784x256xf32> loc("MNISTLinear":4294967295:0)) -> tensor<1x10xf32> {
+    // CHECK: #[[LAYOUT_6:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <{{.*}}>, memref<{{.*}}, #l1_>, interleaved>
+    // CHECK: #[[LAYOUT_7:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <{{.*}}>, memref<{{.*}}, #l1_>, interleaved>
+    // CHECK: #[[LAYOUT_8:.*]] = #tt.layout<(d0, d1) -> (d0, d1), undef, <{{.*}}>, memref<{{.*}}, #dram>, interleaved>
+    %0 = tensor.empty() : tensor<1x256xf32> loc(#loc8)
+    // CHECK: %[[C:.*]] = "ttnn.matmul"[[C:.*]] -> tensor<1x256xf32, #[[LAYOUT_6]]>
+    %1 = "ttir.matmul"(%arg0, %arg4, %0) <{operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<1x784xf32>, tensor<784x256xf32>, tensor<1x256xf32>) -> tensor<1x256xf32> loc(#loc8)
+    %2 = tensor.empty() : tensor<1x256xf32> loc(#loc9)
+    // CHECK: %[[C:.*]] = "ttnn.add"[[C:.*]] -> tensor<1x256xf32, #[[LAYOUT_6]]>
+    %3 = "ttir.add"(%1, %arg3, %2) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<1x256xf32>, tensor<1x256xf32>, tensor<1x256xf32>) -> tensor<1x256xf32> loc(#loc9)
+    %4 = tensor.empty() : tensor<1x256xf32> loc(#loc10)
+    // CHECK: %[[C:.*]] = "ttnn.relu"[[C:.*]] -> tensor<1x256xf32, #[[LAYOUT_6]]>
= "ttir.relu"(%3, %4) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<1x256xf32>, tensor<1x256xf32>) -> tensor<1x256xf32> loc(#loc10) + %6 = tensor.empty() : tensor<1x10xf32> loc(#loc11) + // CHECK: %[[C:.*]] = "ttnn.matmul"[[C:.*]] -> tensor<1x10xf32, #[[LAYOUT_7]]> + %7 = "ttir.matmul"(%5, %arg2, %6) <{operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<1x256xf32>, tensor<256x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32> loc(#loc11) + %8 = tensor.empty() : tensor<1x10xf32> loc(#loc12) + // CHECK: %[[C:.*]] = "ttnn.add"[[C:.*]] -> tensor<1x10xf32, #[[LAYOUT_7]]> + %9 = "ttir.add"(%7, %arg1, %8) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<1x10xf32>, tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32> loc(#loc12) + %10 = tensor.empty() : tensor<1x10xf32> loc(#loc13) + // CHECK: %{{.*}} = "ttnn.softmax"{{.*}} -> tensor<1x10xf32, #[[LAYOUT_8]]> + %11 = "ttir.softmax"(%9, %10) <{dimension = 1 : si32, operand_constraints = [#any_device, #any_device]}> : (tensor<1x10xf32>, tensor<1x10xf32>) -> tensor<1x10xf32> loc(#loc13) + return %11 : tensor<1x10xf32> loc(#loc7) + } loc(#loc) +} loc(#loc) +#loc1 = loc("MNISTLinear":4294967295:10) +#loc2 = loc("MNISTLinear":4294967295:8) +#loc3 = loc("MNISTLinear":4294967295:6) +#loc4 = loc("MNISTLinear":4294967295:4) +#loc5 = loc("MNISTLinear":4294967295:3) +#loc6 = loc("MNISTLinear":4294967295:2) +#loc7 = loc(unknown) +#loc8 = loc("matmul_1"(#loc1)) +#loc9 = loc("add_2"(#loc2)) +#loc10 = loc("relu_3"(#loc3)) +#loc11 = loc("matmul_5"(#loc4)) +#loc12 = loc("add_6"(#loc5)) +#loc13 = loc("softmax_7"(#loc6)) diff --git a/test/ttmlir/Silicon/TTNN/simple_fork_join.mlir b/test/ttmlir/Silicon/TTNN/simple_fork_join.mlir new file mode 100644 index 000000000..981c26b49 --- /dev/null +++ b/test/ttmlir/Silicon/TTNN/simple_fork_join.mlir @@ -0,0 +1,18 @@ +// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path% enable-optimizer=true memory-layout-analysis-enabled=true memory-layout-analysis-policy=L1Interleaved" %s > %t.mlir +// RUN: FileCheck %s --input-file=%t.mlir +// RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn +// UNSUPPORTED: true +#any_device = #tt.operand_constraint +module attributes {} { + func.func @forward(%arg0: tensor<64x128xbf16>, %arg1: tensor<64x128xbf16>, %arg2: tensor<64x128xbf16>, %arg3: tensor<64x128xbf16>) -> tensor<64x128xbf16> { + %0 = tensor.empty() : tensor<64x128xbf16> + %1 = "ttir.add"(%arg0, %arg1, %0) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x128xbf16>, tensor<64x128xbf16>, tensor<64x128xbf16>) -> tensor<64x128xbf16> + %2 = tensor.empty() : tensor<64x128xbf16> + %3 = "ttir.add"(%arg2, %arg3, %2) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x128xbf16>, tensor<64x128xbf16>, tensor<64x128xbf16>) -> tensor<64x128xbf16> + %4 = tensor.empty() : tensor<64x128xbf16> + %5 = "ttir.add"(%1, %3, %4) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device, #any_device]}> : (tensor<64x128xbf16>, tensor<64x128xbf16>, tensor<64x128xbf16>) -> tensor<64x128xbf16> + %6 = tensor.empty() : tensor<64x128xbf16> + %7 = "ttir.relu"(%5, %6) <{operandSegmentSizes = array, operand_constraints = [#any_device, #any_device]}> : (tensor<64x128xbf16>, tensor<64x128xbf16>) -> tensor<64x128xbf16> + return %7 : tensor<64x128xbf16> + } +}