From 41b486c0c79514c74a541669cf0e8210beb91449 Mon Sep 17 00:00:00 2001 From: Tapasvi Patel <133996364+tapspatel@users.noreply.github.com> Date: Wed, 11 Dec 2024 11:07:21 -0600 Subject: [PATCH] #1507: Added ability to gather golden result information and consolidate into report as well as store golden and device artifacts generated during runtime (#1544) --- runtime/include/tt/runtime/detail/debug.h | 8 +- runtime/tools/python/ttrt/common/golden.py | 77 +++++++++++++++++++- runtime/tools/python/ttrt/common/run.py | 58 ++++++++++++++- runtime/tools/python/ttrt/common/util.py | 19 +++-- runtime/tools/python/ttrt/runtime/module.cpp | 8 ++ 5 files changed, 155 insertions(+), 15 deletions(-) diff --git a/runtime/include/tt/runtime/detail/debug.h b/runtime/include/tt/runtime/detail/debug.h index ed049df2d..829f23660 100644 --- a/runtime/include/tt/runtime/detail/debug.h +++ b/runtime/include/tt/runtime/detail/debug.h @@ -63,13 +63,19 @@ struct Hooks { #endif } + void unregisterHooks() const { +#if defined(TT_RUNTIME_DEBUG) && TT_RUNTIME_DEBUG == 1 + operatorCallback = std::nullopt; +#endif + } + private: #if defined(TT_RUNTIME_DEBUG) && TT_RUNTIME_DEBUG == 1 Hooks(std::optional> operatorCallback) : operatorCallback(operatorCallback) {} - std::optional> + mutable std::optional> operatorCallback; #else constexpr Hooks() = default; diff --git a/runtime/tools/python/ttrt/common/golden.py b/runtime/tools/python/ttrt/common/golden.py index 7bab624f6..2666c1f2e 100644 --- a/runtime/tools/python/ttrt/common/golden.py +++ b/runtime/tools/python/ttrt/common/golden.py @@ -16,10 +16,27 @@ import shutil import atexit import re +from functools import partial from ttrt.common.util import * +class GoldenRuntimeConfig: + def __init__( + self, + atol=1e-08, + rtol=1e-05, + pcc=0.99, + artifact_dir="", + save_golden_tensors=False, + ): + self.artifact_dir = artifact_dir + self.pcc = pcc + self.atol = atol + self.rtol = rtol + self.save_golden_tensors = save_golden_tensors + + def get_atol_rtol_pcc(golden, calculated): import numpy as np import torch @@ -103,7 +120,9 @@ def get_pcc(golden, calculated): ) -def golden(binary, programContext, opContext): +def golden_partial_function( + golden_runtime_config, golden_results_data, binary, program_context, op_context +): import torch import ttrt.runtime import ttrt.binary @@ -111,12 +130,23 @@ def golden(binary, programContext, opContext): print("-----------executing golden comparision-----------") try: - loc = ttrt.runtime.get_op_loc_info(opContext) + op_debug_str = ttrt.runtime.get_op_debug_str(op_context) + # find matching golden tensor based on loc in op debug string + match = re.search(r"loc\(([^)]+)\)", op_debug_str) + + if not match: + print(f"debug_str={op_debug_str}") + print("No location found in debug string - skipping golden comparison") + return + + loc = match.group(1).replace('"', "") print(f"found location={loc}") op_golden_tensor = binary.get_debug_info_golden(loc) - op_output_tensor = ttrt.runtime.get_op_output_tensor(opContext, programContext) + op_output_tensor = ttrt.runtime.get_op_output_tensor( + op_context, program_context + ) if len(op_golden_tensor) == 0: print("Golden tensor is empty - skipping golden comparison") @@ -139,11 +169,52 @@ def golden(binary, programContext, opContext): op_output_tensor, dtype=torch.float32 ).flatten() + if golden_runtime_config.save_golden_tensors: + torch.save( + golden_tensor_torch, + f"{golden_runtime_config.artifact_dir}/{loc}_golden.pt", + ) + torch.save( + output_tensor_torch, + f"{golden_runtime_config.artifact_dir}/{loc}_device.pt", + ) + _, _, cal_pcc, output_str = get_atol_rtol_pcc( golden_tensor_torch, output_tensor_torch ) print(f"PCC={cal_pcc}") print(output_str) + + results = {} + results["expected_pcc"] = golden_runtime_config.pcc + results["actual_pcc"] = cal_pcc + results["atol"] = golden_runtime_config.atol + results["rtol"] = golden_runtime_config.rtol + results["allclose"] = torch.allclose( + golden_tensor_torch, + output_tensor_torch, + atol=golden_runtime_config.atol, + rtol=golden_runtime_config.rtol, + ) + results["max"] = torch.max( + torch.abs(golden_tensor_torch - output_tensor_torch) + ).item() + results["mean_absolute_error"] = torch.mean( + torch.abs(golden_tensor_torch - output_tensor_torch) + ).item() + results["root_mean_square_error"] = torch.sqrt( + torch.mean((golden_tensor_torch - output_tensor_torch) ** 2) + ).item() + results["cosine_similarity"] = torch.nn.functional.cosine_similarity( + golden_tensor_torch.unsqueeze(0), output_tensor_torch.unsqueeze(0) + ).item() + + golden_results_data[loc] = results + finally: print("-----------finished executing golden comparision-----------") + + +def get_golden_fn(golden_runtime_config, golden_results_data): + return partial(golden_partial_function, golden_runtime_config, golden_results_data) diff --git a/runtime/tools/python/ttrt/common/run.py b/runtime/tools/python/ttrt/common/run.py index 19ad61e24..b6a6bf51d 100644 --- a/runtime/tools/python/ttrt/common/run.py +++ b/runtime/tools/python/ttrt/common/run.py @@ -18,7 +18,7 @@ from ttrt.common.util import * from ttrt.common.query import Query -from ttrt.common.golden import golden +from ttrt.common.golden import get_golden_fn, GoldenRuntimeConfig class Run: @@ -103,6 +103,13 @@ def initialize_api(): choices=None, help="atol for golden test", ) + Run.register_arg( + name="--pcc", + type=float, + default=0.99, + choices=None, + help="pcc for golden test", + ) Run.register_arg( name="--seed", type=int, @@ -159,6 +166,13 @@ def initialize_api(): choices=[True, False], help="run golden comparison for intermediate and output tensors", ) + Run.register_arg( + name="--save-golden-tensors", + type=bool, + default=False, + choices=[True, False], + help="save golden and device tensors that are compared during callback runtime", + ) Run.register_arg( name="binary", type=str, @@ -348,9 +362,6 @@ def _execute(binaries): self.logging.warning(f"no binaries found to run - returning early") return - if self["--golden"]: - callback_env = ttrt.runtime.DebugHooks.get(golden) - debug_env = ttrt.runtime.DebugEnv.get( self["--load-kernels-from-disk"], self["--enable-async-ttnn"] ) @@ -373,6 +384,9 @@ def _execute(binaries): try: self.logging.info(f"evaluating binary={bin.file_path}") + if self["--save-artifacts"]: + self.artifacts.create_binary_artifacts_folder(bin) + program_indices = [] if self["--program-index"] == "all": program_indices.extend(range(bin.get_num_programs())) @@ -440,6 +454,20 @@ def _execute(binaries): total_outputs.append(outputs) event = None + golden_results_data = {} + if self["--golden"]: + callback_env = ttrt.runtime.DebugHooks.get( + get_golden_fn( + GoldenRuntimeConfig( + self["--atol"], + self["--rtol"], + self["--pcc"], + f"{self.artifacts.get_binary_folder_path(bin)}/run/program_{program_index}", + self["--save-golden-tensors"], + ), + golden_results_data, + ) + ) for loop in range(self["--loops"]): self.logging.debug( f"starting loop={loop+1}/{self['--loops']} for binary={bin.file_path}" @@ -519,6 +547,28 @@ def _execute(binaries): self.logging.debug(f"{tensor}\n") device.deallocate_buffers() + + # if golden comparison is enabled, check golden results json file to see if test passed + if self["--golden"]: + if self["--save-artifacts"]: + golden_results_file_path = f"{self.artifacts.get_binary_folder_path(bin)}/run/program_{program_index}/golden_results.json" + + with open( + golden_results_file_path, "w" + ) as json_file: + json.dump( + golden_results_data, json_file, indent=4 + ) + + for loc, golden_data in golden_results_data.items(): + if ( + golden_data["actual_pcc"] + < golden_data["expected_pcc"] + ): + raise Exception( + f"Failed: golden comparison failed for program={program_index}, actual_pcc={golden_data['actual_pcc']} < expected_pcc={golden_data['expected_pcc']}" + ) + except Exception as e: test_result = { "file_path": bin.file_path, diff --git a/runtime/tools/python/ttrt/common/util.py b/runtime/tools/python/ttrt/common/util.py index 45e0a9db9..f1b7cfdc4 100644 --- a/runtime/tools/python/ttrt/common/util.py +++ b/runtime/tools/python/ttrt/common/util.py @@ -409,16 +409,22 @@ def clean_artifacts(self): def clean_binary_artifacts(self, binary): self.file_manager.remove_directory(self.get_binary_folder_path(binary)) + def create_binary_artifacts_folder(self, binary): + binary_folder = self.get_binary_folder_path(binary) + self.file_manager.create_directory(binary_folder) + self.file_manager.create_directory(f"{binary_folder}/run") + self.file_manager.create_directory(f"{binary_folder}/perf") + + for program in binary.programs: + program_folder = f"{binary_folder}/run/program_{program.index}" + self.file_manager.create_directory(program_folder) + def save_binary(self, binary, query=None): binary_folder = self.get_binary_folder_path(binary) self.logging.info( f"saving binary={binary.file_path} to binary_folder={binary_folder}" ) - self.file_manager.create_directory(binary_folder) - self.file_manager.create_directory(f"{binary_folder}/run") - self.file_manager.create_directory(f"{binary_folder}/perf") - self.file_manager.copy_file(f"{binary_folder}", binary.file_path) for program in binary.programs: @@ -427,20 +433,19 @@ def save_binary(self, binary, query=None): self.logging.info( f"saving program={program.index} for binary={binary.file_path} to program_folder={program_folder}" ) - self.file_manager.create_directory(program_folder) for i in range(len(program.input_tensors)): self.save_torch_tensor( program_folder, program.input_tensors[i], - f"program_{program.index}_input_{i}.pt", + f"input_{i}.pt", ) for i in range(len(program.output_tensors)): self.save_torch_tensor( program_folder, program.output_tensors[i], - f"program_{program.index}_output_{i}.pt", + f"output_{i}.pt", ) if query != None: diff --git a/runtime/tools/python/ttrt/runtime/module.cpp b/runtime/tools/python/ttrt/runtime/module.cpp index 47b42eab5..2efdd1a03 100644 --- a/runtime/tools/python/ttrt/runtime/module.cpp +++ b/runtime/tools/python/ttrt/runtime/module.cpp @@ -222,4 +222,12 @@ PYBIND11_MODULE(_C, m) { &tt::runtime::ttnn::test::getHostRowMajorLayout, py::arg("dtype"), "Get host row major layout"); #endif + + /** + * Cleanup code to force a well ordered destruction w.r.t. the GIL + */ + auto cleanup_callback = []() { + ::tt::runtime::debug::Hooks::get().unregisterHooks(); + }; + m.add_object("_cleanup", py::capsule(cleanup_callback)); }