From 7757540998d6e3485d5960efff4dd9003d73d93c Mon Sep 17 00:00:00 2001 From: Tapasvi Patel Date: Tue, 7 Jan 2025 23:38:33 +0000 Subject: [PATCH] #1683: Updated memory collection and report of dram and l1 using new metal APIs. --- runtime/include/tt/runtime/detail/ttmetal.h | 6 + runtime/include/tt/runtime/detail/ttnn.h | 6 + runtime/include/tt/runtime/runtime.h | 3 + runtime/include/tt/runtime/types.h | 12 + runtime/lib/runtime.cpp | 35 +++ runtime/lib/ttmetal/runtime.cpp | 48 ++++ runtime/lib/ttnn/runtime.cpp | 45 ++++ runtime/tools/python/ttrt/common/callback.py | 228 ++++++------------- runtime/tools/python/ttrt/common/run.py | 64 +----- runtime/tools/python/ttrt/runtime/module.cpp | 24 +- 10 files changed, 250 insertions(+), 221 deletions(-) diff --git a/runtime/include/tt/runtime/detail/ttmetal.h b/runtime/include/tt/runtime/detail/ttmetal.h index deff10755..9827a0657 100644 --- a/runtime/include/tt/runtime/detail/ttmetal.h +++ b/runtime/include/tt/runtime/detail/ttmetal.h @@ -43,6 +43,12 @@ void deallocateBuffers(Device device); void dumpMemoryReport(Device device); +std::unordered_map +getDramMemoryView(Device device); + +std::unordered_map +getL1MemoryView(Device device); + void wait(Event event); void wait(Tensor tensor); diff --git a/runtime/include/tt/runtime/detail/ttnn.h b/runtime/include/tt/runtime/detail/ttnn.h index 2310789b6..88009d2bf 100644 --- a/runtime/include/tt/runtime/detail/ttnn.h +++ b/runtime/include/tt/runtime/detail/ttnn.h @@ -93,6 +93,12 @@ void deallocateBuffers(Device device); void dumpMemoryReport(Device device); +std::unordered_map +getDramMemoryView(Device device); + +std::unordered_map +getL1MemoryView(Device device); + void wait(Event event); void wait(Tensor tensor); diff --git a/runtime/include/tt/runtime/runtime.h b/runtime/include/tt/runtime/runtime.h index 2f278ffc1..9414b022e 100644 --- a/runtime/include/tt/runtime/runtime.h +++ b/runtime/include/tt/runtime/runtime.h @@ -7,6 +7,7 @@ #include #include +#include #include #include "tt/runtime/types.h" @@ -20,6 +21,8 @@ std::pair getCurrentSystemDesc(); namespace detail { void deallocateBuffers(Device device); void dumpMemoryReport(Device device); +std::unordered_map getDramMemoryView(Device device); +std::unordered_map getL1MemoryView(Device device); } // namespace detail DeviceRuntime getCurrentRuntime(); diff --git a/runtime/include/tt/runtime/types.h b/runtime/include/tt/runtime/types.h index cc2791e23..ecb7912b4 100644 --- a/runtime/include/tt/runtime/types.h +++ b/runtime/include/tt/runtime/types.h @@ -67,6 +67,18 @@ struct RuntimeCheckedObjectImpl { } }; +struct MemoryView { + std::uint64_t num_banks; + size_t bytes_allocatable_per_bank; + size_t bytes_allocated_per_bank; + size_t bytes_free_per_bank; + size_t total_bytes_allocatable; // bytes_allocatable_per_bank * num_banks + size_t total_bytes_allocated; // bytes_allocated_per_bank * num_banks + size_t total_bytes_free; // bytes_free_per_bank * num_banks + size_t largest_contiguous_bytes_free_per_bank; + std::vector> block_table; +}; + } // namespace detail struct TensorDesc { diff --git a/runtime/lib/runtime.cpp b/runtime/lib/runtime.cpp index c25cfed51..6bff94eff 100644 --- a/runtime/lib/runtime.cpp +++ b/runtime/lib/runtime.cpp @@ -60,6 +60,41 @@ void dumpMemoryReport(Device device) { LOG_FATAL("runtime is not enabled"); } + +std::unordered_map +getDramMemoryView(Device device) { +#if defined(TT_RUNTIME_ENABLE_TTNN) + if (getCurrentRuntime() == DeviceRuntime::TTNN) { + return ::tt::runtime::ttnn::getDramMemoryView(device); + } +#endif + +#if defined(TT_RUNTIME_ENABLE_TTMETAL) + if (getCurrentRuntime() == DeviceRuntime::TTMetal) { + return ::tt::runtime::ttmetal::getDramMemoryView(device); + } +#endif + + LOG_FATAL("runtime is not enabled"); +} + +std::unordered_map +getL1MemoryView(Device device) { +#if defined(TT_RUNTIME_ENABLE_TTNN) + if (getCurrentRuntime() == DeviceRuntime::TTNN) { + return ::tt::runtime::ttnn::getL1MemoryView(device); + } +#endif + +#if defined(TT_RUNTIME_ENABLE_TTMETAL) + if (getCurrentRuntime() == DeviceRuntime::TTMetal) { + return ::tt::runtime::ttmetal::getL1MemoryView(device); + } +#endif + + LOG_FATAL("runtime is not enabled"); +} + } // namespace detail DeviceRuntime getCurrentRuntime() { diff --git a/runtime/lib/ttmetal/runtime.cpp b/runtime/lib/ttmetal/runtime.cpp index 68322154d..b3ab854f3 100644 --- a/runtime/lib/ttmetal/runtime.cpp +++ b/runtime/lib/ttmetal/runtime.cpp @@ -33,6 +33,21 @@ static Tensor createNullTensor() { return Tensor(nullptr, nullptr, DeviceRuntime::TTMetal); } +static tt::runtime::detail::MemoryView +createMemoryView(const tt::tt_metal::detail::MemoryView &memoryView) { + return tt::runtime::detail::MemoryView{ + .num_banks = memoryView.num_banks, + .bytes_allocatable_per_bank = memoryView.bytes_allocatable_per_bank, + .bytes_allocated_per_bank = memoryView.bytes_allocated_per_bank, + .bytes_free_per_bank = memoryView.bytes_free_per_bank, + .total_bytes_allocatable = memoryView.total_bytes_allocatable, + .total_bytes_allocated = memoryView.total_bytes_allocated, + .total_bytes_free = memoryView.total_bytes_free, + .largest_contiguous_bytes_free_per_bank = + memoryView.largest_contiguous_bytes_free_per_bank, + .block_table = memoryView.blockTable}; +} + Tensor createTensor(std::shared_ptr data, std::vector const &shape, std::vector const &stride, @@ -113,6 +128,39 @@ void dumpMemoryReport(Device deviceHandle) { } } +std::unordered_map +getDramMemoryView(Device deviceHandle) { + std::unordered_map memoryMap; + + ::tt::tt_metal::distributed::MeshDevice &meshDevice = + deviceHandle.as<::tt::tt_metal::distributed::MeshDevice>( + DeviceRuntime::TTMetal); + + for (::tt::tt_metal::Device *device : meshDevice.get_devices()) { + auto dramMemoryView = ::tt::tt_metal::detail::GetDramMemoryView(device); + memoryMap[device->id()] = createMemoryView(dramMemoryView); + } + + return memoryMap; +} + +std::unordered_map +getL1MemoryView(Device deviceHandle) { + std::unordered_map memoryMap; + + ::tt::tt_metal::distributed::MeshDevice &meshDevice = + deviceHandle.as<::tt::tt_metal::distributed::MeshDevice>( + DeviceRuntime::TTMetal); + + for (::tt::tt_metal::Device *device : meshDevice.get_devices()) { + auto l1MemoryView = ::tt::tt_metal::detail::GetL1MemoryView(device); + memoryMap[device->id()] = memoryMap[device->id()] = + createMemoryView(l1MemoryView); + } + + return memoryMap; +} + void wait(Event event) { Events events = event.as(DeviceRuntime::TTMetal); for (auto e : events) { diff --git a/runtime/lib/ttnn/runtime.cpp b/runtime/lib/ttnn/runtime.cpp index 3fd7ba1b9..b6ba893a2 100644 --- a/runtime/lib/ttnn/runtime.cpp +++ b/runtime/lib/ttnn/runtime.cpp @@ -87,6 +87,21 @@ static Tensor createNullTensor() { return Tensor(nullptr, nullptr, DeviceRuntime::TTNN); } +static tt::runtime::detail::MemoryView +createMemoryView(const tt::tt_metal::detail::MemoryView &memoryView) { + return tt::runtime::detail::MemoryView{ + .num_banks = memoryView.num_banks, + .bytes_allocatable_per_bank = memoryView.bytes_allocatable_per_bank, + .bytes_allocated_per_bank = memoryView.bytes_allocated_per_bank, + .bytes_free_per_bank = memoryView.bytes_free_per_bank, + .total_bytes_allocatable = memoryView.total_bytes_allocatable, + .total_bytes_allocated = memoryView.total_bytes_allocated, + .total_bytes_free = memoryView.total_bytes_free, + .largest_contiguous_bytes_free_per_bank = + memoryView.largest_contiguous_bytes_free_per_bank, + .block_table = memoryView.blockTable}; +} + static DeviceVariant getTargetDevice(::ttnn::MeshDevice &meshDevice) { if (meshDevice.num_devices() == 1) { return std::ref(*(meshDevice.get_device_index(0))); @@ -222,6 +237,36 @@ void dumpMemoryReport(Device deviceHandle) { } } +std::unordered_map +getDramMemoryView(Device deviceHandle) { + std::unordered_map memoryMap; + + ::ttnn::MeshDevice &meshDevice = + deviceHandle.as<::ttnn::MeshDevice>(DeviceRuntime::TTNN); + + for (::ttnn::Device *device : meshDevice.get_devices()) { + auto dramMemoryView = ::tt::tt_metal::detail::GetDramMemoryView(device); + memoryMap[device->id()] = createMemoryView(dramMemoryView); + } + + return memoryMap; +} + +std::unordered_map +getL1MemoryView(Device deviceHandle) { + std::unordered_map memoryMap; + + ::ttnn::MeshDevice &meshDevice = + deviceHandle.as<::ttnn::MeshDevice>(DeviceRuntime::TTNN); + + for (::ttnn::Device *device : meshDevice.get_devices()) { + auto l1MemoryView = ::tt::tt_metal::detail::GetL1MemoryView(device); + memoryMap[device->id()] = createMemoryView(l1MemoryView); + } + + return memoryMap; +} + void wait(Event event) { // Nothing to do for ttnn runtime LOG_ASSERT(event.matchesRuntime(DeviceRuntime::TTNN)); diff --git a/runtime/tools/python/ttrt/common/callback.py b/runtime/tools/python/ttrt/common/callback.py index 56223224f..26480eeef 100644 --- a/runtime/tools/python/ttrt/common/callback.py +++ b/runtime/tools/python/ttrt/common/callback.py @@ -62,6 +62,43 @@ def save_memory_report(self, memory_report_path): self.logging.debug(f"Saved memory report to={memory_report_path}") + def check_pcc(self): + for loc, golden_data in self.golden_report.items(): + if golden_data["actual_pcc"] < golden_data["expected_pcc"]: + raise Exception( + f"Failed: golden comparison failed, actual_pcc={golden_data['actual_pcc']} < expected_pcc={golden_data['expected_pcc']}" + ) + + def check_memory_leak(self): + num_items = 0 + for key, value in self.memory_report.items(): + num_items += 1 + + if num_items == 0: + self.logging.warning(f"No memory data found") + else: + # query initial memory usage + dram_initial_size = self.memory_report[0]["dram"][ + "total_allocated (bytes) : total_allocated/bank * num_banks" + ] + l1_initlal_size = self.memory_report[0]["l1"][ + "total_allocated (bytes) : total_allocated/bank * num_banks" + ] + + # query final memory usage and ensure no memory leaks + dram_final_size = self.memory_report[num_items - 1]["dram"][ + "total_allocated (bytes) : total_allocated/bank * num_banks" + ] + l1_final_size = self.memory_report[num_items - 1]["l1"][ + "total_allocated (bytes) : total_allocated/bank * num_banks" + ] + + if dram_final_size > dram_initial_size: + raise Exception("Memory leak detected in DRAM") + + if l1_final_size > l1_initlal_size: + raise Exception("Memory leak detected in L1 cache") + """ -----------------------GOLDEN CALLBACK----------------------- @@ -234,153 +271,6 @@ def golden(callback_runtime_config, binary, program_context, op_context): """ -def add_key(dram_memory_usage, l1_memory_usage, current_section, key, value): - if current_section == "DRAM": - dram_memory_usage[key] = value - elif current_section == "L1": - l1_memory_usage[key] = value - - -def parse_detailed_memory_usage_file(dram_memory_usage, l1_memory_usage, file_path): - current_section = None - - with open(file_path, "r") as file: - reader = csv.reader(file) - blocks = [] - - for row in reader: - if not any(row): - continue - - if row[1].strip() == "DRAM": - current_section = "DRAM" - elif row[1].strip() == "L1": - current_section = "L1" - elif "Total" in row[1]: - if row[1].strip() == "Total allocatable (B):": - add_key( - dram_memory_usage, - l1_memory_usage, - current_section, - "total_allocatable (bytes) : total_allocatable/bank * num_banks", - row[2].strip(), - ) - elif row[1].strip() == "Total allocated (B):": - add_key( - dram_memory_usage, - l1_memory_usage, - current_section, - "total_allocated (bytes) : total_allocated/bank * num_banks", - row[2].strip(), - ) - elif row[1].strip() == "Total free (B):": - add_key( - dram_memory_usage, - l1_memory_usage, - current_section, - "total_free (bytes) : total_allocatable - total_allocated", - row[2].strip(), - ) - elif "Blocks" in row[2]: - blocks = [] - else: - block = {} - block["address (bytes)"] = row[3].strip() - block["size (bytes)"] = row[4].strip() - block["allocated (y/n)"] = row[5].strip() - - blocks.append(block) - add_key( - dram_memory_usage, - l1_memory_usage, - current_section, - "blocks", - blocks, - ) - - -def parse_memory_usage_summary_file(dram_memory_usage, l1_memory_usage, file_path): - with open(file_path, "r") as file: - reader = csv.reader(file) - current_section = "DRAM" - - for row in reader: - if not any(row): - continue - - if "Total Allocatable Size" in row[1]: - continue - - add_key( - dram_memory_usage, - l1_memory_usage, - current_section, - "total_allocatable (bytes) : per bank", - row[1].strip(), - ) - add_key( - dram_memory_usage, - l1_memory_usage, - current_section, - "total_allocated (bytes): per bank", - row[2].strip(), - ) - add_key( - dram_memory_usage, - l1_memory_usage, - current_section, - "total_free (bytes) : per bank", - row[3].strip(), - ) - add_key( - dram_memory_usage, - l1_memory_usage, - current_section, - "largest_free_block (bytes) : per bank", - row[4].strip(), - ) - - if current_section == "DRAM": - current_section = "L1" - - -def parse_l1_usage_summary_file(dram_memory_usage, l1_memory_usage, file_path): - with open(file_path, "r") as file: - reader = csv.reader(file) - dram_row = True - - for index, row in enumerate(reader): - if index == 2: - add_key( - dram_memory_usage, - l1_memory_usage, - "L1", - "largest_contiguous_free_block (bytes) : per bank", - row[1].strip(), - ) - - -def parse_memory_csv_files( - detailed_memory_usage_file_path, - memory_usage_summary_file_path, - l1_usage_summary_file_path, -): - dram_memory_usage = {} - l1_memory_usage = {} - - parse_detailed_memory_usage_file( - dram_memory_usage, l1_memory_usage, detailed_memory_usage_file_path - ) - parse_memory_usage_summary_file( - dram_memory_usage, l1_memory_usage, memory_usage_summary_file_path - ) - parse_l1_usage_summary_file( - dram_memory_usage, l1_memory_usage, l1_usage_summary_file_path - ) - - return dram_memory_usage, l1_memory_usage - - def memory(callback_runtime_config, binary, program_context, op_context): import ttrt.runtime import ttrt.binary @@ -391,21 +281,43 @@ def memory(callback_runtime_config, binary, program_context, op_context): loc = ttrt.runtime.get_op_loc_info(op_context) debug_str = ttrt.runtime.get_op_debug_str(op_context) - device.dump_memory_report() - memory_dump_dir_path = f"{get_ttrt_metal_home_path()}/generated/reports" - - # read generated memory reports and store in condensed memory_report - dram_memory_usage, l1_memory_usage = parse_memory_csv_files( - f"{memory_dump_dir_path}/detailed_memory_usage.csv", - f"{memory_dump_dir_path}/memory_usage_summary.csv", - f"{memory_dump_dir_path}/l1_usage_summary.csv", - ) + dram_memory_view = device.get_dram_memory_view() + l1_memory_view = device.get_l1_memory_view() op_memory_report = {} op_memory_report["loc"] = loc op_memory_report["debug_str"] = debug_str - op_memory_report["dram"] = dram_memory_usage - op_memory_report["l1"] = l1_memory_usage + + dram_op_device_memory_report = {} + for index, memory_view in dram_memory_view.items(): + dram_op_device_memory_report[str(index)] = { + "num_banks": memory_view.num_banks, + "bytes_allocatable_per_bank": memory_view.bytes_allocatable_per_bank, + "bytes_allocated_per_bank": memory_view.bytes_allocated_per_bank, + "bytes_free_per_bank": memory_view.bytes_free_per_bank, + "total_bytes_allocatable": memory_view.total_bytes_allocatable, + "total_bytes_allocated": memory_view.total_bytes_allocated, + "total_bytes_free": memory_view.total_bytes_free, + "largest_contiguous_bytes_free_per_bank": memory_view.largest_contiguous_bytes_free_per_bank, + "block_table": memory_view.block_table, + } + + l1_op_device_memory_report = {} + for index, memory_view in l1_memory_view.items(): + l1_op_device_memory_report[str(index)] = { + "num_banks": memory_view.num_banks, + "bytes_allocatable_per_bank": memory_view.bytes_allocatable_per_bank, + "bytes_allocated_per_bank": memory_view.bytes_allocated_per_bank, + "bytes_free_per_bank": memory_view.bytes_free_per_bank, + "total_bytes_allocatable": memory_view.total_bytes_allocatable, + "total_bytes_allocated": memory_view.total_bytes_allocated, + "total_bytes_free": memory_view.total_bytes_free, + "largest_contiguous_bytes_free_per_bank": memory_view.largest_contiguous_bytes_free_per_bank, + "block_table": memory_view.block_table, + } + + op_memory_report["dram"] = dram_op_device_memory_report + op_memory_report["l1"] = l1_op_device_memory_report callback_runtime_config.memory_report[ callback_runtime_config.callback_counter() ] = op_memory_report diff --git a/runtime/tools/python/ttrt/common/run.py b/runtime/tools/python/ttrt/common/run.py index b83c5d390..d5f468739 100644 --- a/runtime/tools/python/ttrt/common/run.py +++ b/runtime/tools/python/ttrt/common/run.py @@ -589,17 +589,7 @@ def _execute(binaries): f"{self.artifacts.get_binary_folder_path(bin)}/run/program_{program_index}/golden_results.json" ) - for ( - loc, - golden_data, - ) in callback_runtime_config.golden_report.items(): - if ( - golden_data["actual_pcc"] - < golden_data["expected_pcc"] - ): - raise Exception( - f"Failed: golden comparison failed for program={program_index}, actual_pcc={golden_data['actual_pcc']} < expected_pcc={golden_data['expected_pcc']}" - ) + callback_runtime_config.check_pcc() if self["--memory"]: if self["--save-artifacts"]: @@ -608,57 +598,7 @@ def _execute(binaries): ) if self["--check-memory-leak"]: - num_items = 0 - for ( - key, - value, - ) in callback_runtime_config.memory_report.items(): - num_items += 1 - - if num_items == 0: - self.logging.warning(f"No memory data found") - else: - # query initial memory usage - dram_initial_size = callback_runtime_config.memory_report[ - 0 - ][ - "dram" - ][ - "total_allocated (bytes) : total_allocated/bank * num_banks" - ] - l1_initlal_size = callback_runtime_config.memory_report[ - 0 - ][ - "l1" - ][ - "total_allocated (bytes) : total_allocated/bank * num_banks" - ] - - # query final memory usage and ensure no memory leaks - dram_final_size = callback_runtime_config.memory_report[ - num_items - 1 - ][ - "dram" - ][ - "total_allocated (bytes) : total_allocated/bank * num_banks" - ] - l1_final_size = callback_runtime_config.memory_report[ - num_items - 1 - ][ - "l1" - ][ - "total_allocated (bytes) : total_allocated/bank * num_banks" - ] - - if dram_final_size > dram_initial_size: - raise Exception( - "Memory leak detected in DRAM" - ) - - if l1_final_size > l1_initlal_size: - raise Exception( - "Memory leak detected in L1 cache" - ) + callback_runtime_config.check_memory_leak() except Exception as e: test_result = { diff --git a/runtime/tools/python/ttrt/runtime/module.cpp b/runtime/tools/python/ttrt/runtime/module.cpp index 4c3eb8c69..df5db5f58 100644 --- a/runtime/tools/python/ttrt/runtime/module.cpp +++ b/runtime/tools/python/ttrt/runtime/module.cpp @@ -21,9 +21,31 @@ namespace py = pybind11; PYBIND11_MODULE(_C, m) { m.doc() = "ttrt.runtime python extension for interacting with the " "Tenstorrent devices"; + py::class_(m, "MemoryView") + .def_readonly("num_banks", &tt::runtime::detail::MemoryView::num_banks) + .def_readonly( + "bytes_allocatable_per_bank", + &tt::runtime::detail::MemoryView::bytes_allocatable_per_bank) + .def_readonly("bytes_allocated_per_bank", + &tt::runtime::detail::MemoryView::bytes_allocated_per_bank) + .def_readonly("bytes_free_per_bank", + &tt::runtime::detail::MemoryView::bytes_free_per_bank) + .def_readonly("total_bytes_allocatable", + &tt::runtime::detail::MemoryView::total_bytes_allocatable) + .def_readonly("total_bytes_allocated", + &tt::runtime::detail::MemoryView::total_bytes_allocated) + .def_readonly("total_bytes_free", + &tt::runtime::detail::MemoryView::total_bytes_free) + .def_readonly("largest_contiguous_bytes_free_per_bank", + &tt::runtime::detail::MemoryView:: + largest_contiguous_bytes_free_per_bank) + .def_readonly("block_table", + &tt::runtime::detail::MemoryView::block_table); py::class_(m, "Device") .def("deallocate_buffers", &tt::runtime::detail::deallocateBuffers) - .def("dump_memory_report", &tt::runtime::detail::dumpMemoryReport); + .def("dump_memory_report", &tt::runtime::detail::dumpMemoryReport) + .def("get_dram_memory_view", &tt::runtime::detail::getDramMemoryView) + .def("get_l1_memory_view", &tt::runtime::detail::getL1MemoryView); py::class_(m, "Event"); py::class_(m, "Tensor"); py::class_(m, "Layout");