From ec1ba6465191eaf0708a92ff72da30414fb62984 Mon Sep 17 00:00:00 2001 From: Ruonan Wang <105281011+rnwang04@users.noreply.github.com> Date: Wed, 14 Sep 2022 08:34:26 +0800 Subject: [PATCH] Nano : Enhancement for output format of InferenceOptimizer (#5705) * first commit * improve output of optimize * fix method_type for method lack of dependency * update openvino model for thread num * add thread num for trace and quantize * update based on comment: modify output, add progress bar and remove method type * modify latency to latency(ms) * add thread_num for Trainer.trace and Trainer.quantize * fix bug of openvino cpu_num --- .../resnet/inference_pipeline.py | 2 +- .../bigdl/nano/deps/openvino/core/model.py | 10 +- .../bigdl/nano/deps/openvino/openvino_api.py | 9 +- .../bigdl/nano/deps/openvino/pytorch/model.py | 7 +- .../bigdl/nano/pytorch/inference/optimizer.py | 157 +++++++++++++----- .../src/bigdl/nano/pytorch/trainer/Trainer.py | 10 ++ .../tests/test_inference_pipeline_ipex.py | 6 +- 7 files changed, 149 insertions(+), 52 deletions(-) diff --git a/python/nano/example/pytorch/inference_pipeline/resnet/inference_pipeline.py b/python/nano/example/pytorch/inference_pipeline/resnet/inference_pipeline.py index ca6d8534ca2..aeeb2d47638 100644 --- a/python/nano/example/pytorch/inference_pipeline/resnet/inference_pipeline.py +++ b/python/nano/example/pytorch/inference_pipeline/resnet/inference_pipeline.py @@ -45,7 +45,7 @@ def accuracy(pred, target): validation_data=datamodule.val_dataloader(limit_num_samples=160), metric=accuracy, direction="max", - cpu_num=1, + thread_num=1, latency_sample_num=30) # 4. Get the best model under specific restrictions or without restrictions diff --git a/python/nano/src/bigdl/nano/deps/openvino/core/model.py b/python/nano/src/bigdl/nano/deps/openvino/core/model.py index b583cdb9fba..76e85eaf73c 100644 --- a/python/nano/src/bigdl/nano/deps/openvino/core/model.py +++ b/python/nano/src/bigdl/nano/deps/openvino/core/model.py @@ -22,9 +22,10 @@ class OpenVINOModel: - def __init__(self, ie_network: str, device='CPU'): + def __init__(self, ie_network: str, device='CPU', thread_num=None): self._ie = Core() self._device = device + self.thread_num = thread_num self.ie_network = ie_network def forward_step(self, *inputs): @@ -47,8 +48,13 @@ def ie_network(self, model): self._ie_network = self._ie.read_model(model=str(model)) else: self._ie_network = model + if self.thread_num is not None: + config = {"CPU_THREADS_NUM": str(self.thread_num)} + else: + config = {} self._compiled_model = self._ie.compile_model(model=self.ie_network, - device_name=self._device) + device_name=self._device, + config=config) self._infer_request = self._compiled_model.create_infer_request() input_names = [t.any_name for t in self._ie_network.inputs] self._forward_args = input_names diff --git a/python/nano/src/bigdl/nano/deps/openvino/openvino_api.py b/python/nano/src/bigdl/nano/deps/openvino/openvino_api.py index 7357ad4b1f1..200fe4bcb28 100644 --- a/python/nano/src/bigdl/nano/deps/openvino/openvino_api.py +++ b/python/nano/src/bigdl/nano/deps/openvino/openvino_api.py @@ -16,20 +16,23 @@ from functools import partial -def PytorchOpenVINOModel(model, input_sample=None, logging=True, **export_kwargs): +def PytorchOpenVINOModel(model, input_sample=None, thread_num=None, + logging=True, **export_kwargs): """ Create a OpenVINO model from pytorch. :param model: Pytorch model to be converted to OpenVINO for inference or path to Openvino saved model. 
:param input_sample: A set of inputs for trace, defaults to None if you have trace before or - model is a LightningModule with any dataloader attached, defaults to None + model is a LightningModule with any dataloader attached, defaults to None. + :param thread_num: an int that represents how many threads (cores) are needed for + inference. default: None. :param logging: whether to log detailed information of model conversion. default: True. :param **export_kwargs: will be passed to torch.onnx.export function. :return: PytorchOpenVINOModel model for OpenVINO inference. """ from .pytorch.model import PytorchOpenVINOModel - return PytorchOpenVINOModel(model, input_sample, logging, **export_kwargs) + return PytorchOpenVINOModel(model, input_sample, thread_num, logging, **export_kwargs) def load_openvino_model(path): diff --git a/python/nano/src/bigdl/nano/deps/openvino/pytorch/model.py b/python/nano/src/bigdl/nano/deps/openvino/pytorch/model.py index e8a5783e037..2751b42c312 100644 --- a/python/nano/src/bigdl/nano/deps/openvino/pytorch/model.py +++ b/python/nano/src/bigdl/nano/deps/openvino/pytorch/model.py @@ -26,7 +26,8 @@ class PytorchOpenVINOModel(AcceleratedLightningModule): - def __init__(self, model, input_sample=None, logging=True, **export_kwargs): + def __init__(self, model, input_sample=None, thread_num=None, + logging=True, **export_kwargs): """ Create a OpenVINO model from pytorch. @@ -35,6 +36,8 @@ def __init__(self, model, input_sample=None, logging=True, **export_kwargs): :param input_sample: A set of inputs for trace, defaults to None if you have trace before or model is a LightningModule with any dataloader attached, defaults to None. + :param thread_num: an int that represents how many threads (cores) are needed for + inference. default: None. :param logging: whether to log detailed information of model conversion. default: True. :param **export_kwargs: will be passed to torch.onnx.export function.
""" @@ -44,7 +47,7 @@ def __init__(self, model, input_sample=None, logging=True, **export_kwargs): if isinstance(model, torch.nn.Module): export(model, input_sample, str(dir / 'tmp.xml'), logging, **export_kwargs) ov_model_path = dir / 'tmp.xml' - self.ov_model = OpenVINOModel(ov_model_path) + self.ov_model = OpenVINOModel(ov_model_path, thread_num=thread_num) super().__init__(self.ov_model) def on_forward_start(self, inputs): diff --git a/python/nano/src/bigdl/nano/pytorch/inference/optimizer.py b/python/nano/src/bigdl/nano/pytorch/inference/optimizer.py index 30c2b7135ad..eabcdaa20d4 100644 --- a/python/nano/src/bigdl/nano/pytorch/inference/optimizer.py +++ b/python/nano/src/bigdl/nano/pytorch/inference/optimizer.py @@ -85,11 +85,11 @@ def get_accelerator(self): "int8": AccelerationOption(inc=True), "jit_fp32": AccelerationOption(jit=True), "jit_fp32_ipex": AccelerationOption(jit=True, ipex=True), - "jit_fp32_ipex_clast": AccelerationOption(jit=True, ipex=True, - channels_last=True), + "jit_fp32_ipex_channels_last": AccelerationOption(jit=True, ipex=True, + channels_last=True), "openvino_fp32": AccelerationOption(openvino=True), "openvino_int8": AccelerationOption(openvino=True, pot=True), - "onnxruntime_fp32": AccelerationOption(onnxtunrime=True), + "onnxruntime_fp32": AccelerationOption(onnxruntime=True), "onnxruntime_int8_qlinear": AccelerationOption(onnxruntime=True, inc=True, method="qlinear"), "onnxruntime_int8_integer": AccelerationOption(onnxruntime=True, inc=True, @@ -106,13 +106,14 @@ def __init__(self): # optimized_model_dict handles the optimized model and some metadata # in {"method_name": {"latency": ..., "accuracy": ..., "model": ...}} self.optimized_model_dict = {} + self._optimize_result = None def optimize(self, model: nn.Module, training_data: DataLoader, validation_data: DataLoader = None, metric: Callable = None, direction: str = "max", - cpu_num: int = None, + thread_num: int = None, logging: bool = False, latency_sample_num: int = 100) -> None: ''' @@ -135,7 +136,7 @@ def optimize(self, model: nn.Module, :param direction: (optional) A string that indicates the higher/lower better for the metric, "min" for the lower the better and "max" for the higher the better. Default value is "max". - :param cpu_num: (optional) a int represents how many cores is needed for + :param thread_num: (optional) a int represents how many threads(cores) is needed for inference. :param logging: whether to log detailed information of model conversion. default: False. 
@@ -160,24 +161,21 @@ def optimize(self, model: nn.Module, self._calculate_accuracy = False default_threads: int = torch.get_num_threads() - cpu_num: int = default_threads if cpu_num is None else int(cpu_num) - - # set cpu num for onnxruntime - if _onnxruntime_checker(): - import onnxruntime - sessoption = onnxruntime.SessionOptions() - sessoption.intra_op_num_threads = cpu_num - sessoption.inter_op_num_threads = cpu_num - else: - sessoption = None - # TODO: set cpu num for openvino + thread_num: int = default_threads if thread_num is None else int(thread_num) result_map: Dict[str, Dict] = {} - model.eval() # change model to eval state + model.eval() # change model to eval mode - for method, available in available_dict.items(): - if available: + print("==========================Start Optimization==========================") + start_time = time.perf_counter() + for idx, (method, available) in enumerate(available_dict.items()): + result_map[method] = {} + if available is False: + result_map[method]["status"] = "lack dependency" + else: + print(f"----------Start test {method} model " + f"({idx+1}/{len(ALL_INFERENCE_ACCELERATION_METHOD)})----------") option: AccelerationOption = ALL_INFERENCE_ACCELERATION_METHOD[method] use_ipex: bool = option.ipex use_channels_last: bool = option.channels_last @@ -203,11 +201,13 @@ def optimize(self, model: nn.Module, InferenceOptimizer.trace(model=model, accelerator=accelerator, input_sample=input_sample, - onnxruntime_session_options=sessoption, + thread_num=thread_num, # remove output of openvino logging=logging) except Exception as e: print(e) + result_map[method]["status"] = "fail to convert" + print(f"----------Failed to convert to {method}----------") continue # if precision is int8 or bf16, then we will use quantize method @@ -221,25 +221,28 @@ def optimize(self, model: nn.Module, use_ipex=use_ipex, calib_dataloader=training_data, method=ort_method, - onnxruntime_session_options=sessoption, + thread_num=thread_num, # remove output of openvino logging=logging) except Exception as e: print(e) + result_map[method]["status"] = "fail to convert" + print(f"----------Failed to convert to {method}----------") continue - result_map[method] = {} + result_map[method]["status"] = "successful" def func_test(model, input_sample): - model(*input_sample) + with torch.no_grad(): + model(*input_sample) - torch.set_num_threads(cpu_num) + torch.set_num_threads(thread_num) try: result_map[method]["latency"] =\ _throughput_calculate_helper(latency_sample_num, func_test, acce_model, input_sample) except Exception as e: - result_map.pop(method) + result_map[method]["status"] = "fail to forward" torch.set_num_threads(default_threads) continue @@ -252,19 +255,26 @@ def func_test(model, input_sample): result_map[method]["accuracy"] = None result_map[method]["model"] = acce_model - else: - pass + print(f"----------Finish test {method} model " + f"({idx+1}/{len(ALL_INFERENCE_ACCELERATION_METHOD)})----------") self.optimized_model_dict: Dict = result_map - print("==========================Optimization Results==========================") - if self._calculate_accuracy: - for key, value in self.optimized_model_dict.items(): - print("accleration option: {}, latency: {:.4f}ms, accuracy : {:.4f}" - .format(key, value["latency"], value["accuracy"])) - else: - for key, value in self.optimized_model_dict.items(): - print("accleration option: {}, latency: {:.4f}ms :" - .format(key, value["latency"])) + print("\n\n==========================Optimization Results==========================") + + 
self._optimize_result = _format_optimize_result(self.optimized_model_dict, + self._calculate_accuracy) + print(self._optimize_result) + print("Optimization cost {:.3}s in total.".format(time.perf_counter() - start_time)) + print("===========================Stop Optimization===========================") + + def summary(self): + ''' + Print the formatted string representation of the optimization result. + ''' + invalidOperationError(len(self.optimized_model_dict) > 0, + "There is no optimization result. You should call .optimize() " + "before summary()") + print(self._optimize_result) def get_best_model(self, accelerator: str = None, @@ -302,7 +312,7 @@ def get_best_model(self, self.optimized_model_dict["original"]["accuracy"]) for method in self.optimized_model_dict.keys(): - if method == "original": + if method == "original" or self.optimized_model_dict[method]["status"] != "successful": continue option: AccelerationOption = ALL_INFERENCE_ACCELERATION_METHOD[method] result: Dict = self.optimized_model_dict[method] @@ -333,7 +343,7 @@ def get_best_model(self, best_model = result["model"] best_metric = CompareMetric(method, result["latency"], result["accuracy"]) - return best_model, _format_acceleration_info(best_metric.method_name) + return best_model, _format_acceleration_option(best_metric.method_name) @staticmethod def quantize(model: nn.Module, @@ -350,6 +360,7 @@ def quantize(model: nn.Module, timeout: int = None, max_trials: int = None, input_sample=None, + thread_num: int = None, onnxruntime_session_options=None, logging: bool = True, **export_kwargs): @@ -394,6 +405,9 @@ def quantize(model: nn.Module, "timeout=0, max_trials=1" means it will try quantization only once and return satisfying best model. :param input_sample: An input example to convert pytorch model into ONNX/OpenVINO. + :param thread_num: (optional) an int that represents how many threads (cores) are needed for + inference, only valid for accelerator='onnxruntime' + or accelerator='openvino'. :param onnxruntime_session_options: The session option for onnxruntime, only valid when accelerator='onnxruntime', otherwise will be ignored. :param logging: whether to log detailed information of model conversion, only valid when @@ -439,10 +453,17 @@ def quantize(model: nn.Module, if input_sample is None: # input_sample can be a dataloader input_sample = calib_dataloader + if onnxruntime_session_options is None: + import onnxruntime + onnxruntime_session_options = onnxruntime.SessionOptions() + if thread_num is not None: + onnxruntime_session_options.intra_op_num_threads = thread_num + onnxruntime_session_options.inter_op_num_threads = thread_num model = InferenceOptimizer.trace( model, input_sample=input_sample, accelerator='onnxruntime', + onnxruntime_session_options=onnxruntime_session_options, **export_kwargs) """ If accelerator==None, quantized model returned should be an object of PytorchModel @@ -470,6 +491,7 @@ def quantize(model: nn.Module, model = InferenceOptimizer.trace(model, input_sample=input_sample, accelerator='openvino', + thread_num=thread_num, logging=logging, **export_kwargs) invalidInputError(type(model).__name__ == 'PytorchOpenVINOModel', @@ -508,6 +530,7 @@ def trace(model: nn.Module, input_sample=None, accelerator: str = None, use_ipex: bool = False, + thread_num: int = None, onnxruntime_session_options=None, logging: bool = True, **export_kwargs): @@ -522,6 +545,9 @@ def trace(model: nn.Module, :param accelerator: The accelerator to use, defaults to None meaning staying in Pytorch backend.
'openvino', 'onnxruntime' and 'jit' are supported for now. :param use_ipex: whether we use ipex as accelerator for inferencing. default: False. + :param thread_num: (optional) an int that represents how many threads (cores) are needed for + inference, only valid for accelerator='onnxruntime' + or accelerator='openvino'. :param onnxruntime_session_options: The session option for onnxruntime, only valid when accelerator='onnxruntime', otherwise will be ignored. :param logging: whether to log detailed information of model conversion, only valid when @@ -540,8 +566,14 @@ def trace(model: nn.Module, "but got type {}".format(type(model)) ) if accelerator == 'openvino': # openvino backend will not care about ipex usage - return PytorchOpenVINOModel(model, input_sample, logging, **export_kwargs) + return PytorchOpenVINOModel(model, input_sample, thread_num, logging, **export_kwargs) if accelerator == 'onnxruntime': # onnxruntime backend will not care about ipex usage + if onnxruntime_session_options is None: + import onnxruntime + onnxruntime_session_options = onnxruntime.SessionOptions() + if thread_num is not None: + onnxruntime_session_options.intra_op_num_threads = thread_num + onnxruntime_session_options.inter_op_num_threads = thread_num return PytorchONNXRuntimeModel(model, input_sample, onnxruntime_session_options, **export_kwargs) if accelerator == 'jit' or use_ipex: @@ -623,7 +655,8 @@ def _throughput_calculate_helper(iterrun, func, *args): time_list = [] for i in range(iterrun): st = time.perf_counter() - func(*args) + with torch.no_grad(): + func(*args) end = time.perf_counter() time_list.append(end - st) # at least need 10 iters and try to control calculation @@ -649,7 +682,7 @@ def _accuracy_calculate_helper(model, metric, data): return np.sum(metric_list) / sample_num -def _format_acceleration_info(method_name): +def _format_acceleration_option(method_name: str) -> str: ''' Get a string represation for current method's acceleration option ''' @@ -663,3 +696,45 @@ def _format_acceleration_info(method_name): if len(repr_str) > 0: repr_str = repr_str[:-2] return repr_str + + +def _format_optimize_result(optimize_result_dict: dict, + calculate_accuracy: bool) -> str: + ''' + Get a formatted string representation of the optimization result. + ''' + if calculate_accuracy is True: + horizontal_line = " {0} {1} {2} {3}\n" \ + .format("-" * 32, "-" * 22, "-" * 14, "-" * 12) + repr_str = horizontal_line + repr_str += "| {0:^30} | {1:^20} | {2:^12} | {3:^10} |\n" \ + .format("method", "status", "latency(ms)", "accuracy") + repr_str += horizontal_line + for method, result in optimize_result_dict.items(): + status = result["status"] + latency = result.get("latency", "None") + if latency != "None": + latency = round(latency, 3) + accuracy = result.get("accuracy", "None") + if accuracy != "None": + accuracy = round(accuracy, 3) + method_str = f"| {method:^30} | {status:^20} | " \ + f"{latency:^12} | {accuracy:^10} |\n" + repr_str += method_str + repr_str += horizontal_line + else: + horizontal_line = " {0} {1} {2}\n" \ + .format("-" * 32, "-" * 22, "-" * 14) + repr_str = horizontal_line + repr_str += "| {0:^30} | {1:^20} | {2:^12} |\n" \ + .format("method", "status", "latency(ms)") + repr_str += horizontal_line + for method, result in optimize_result_dict.items(): + status = result["status"] + latency = result.get("latency", "None") + if latency != "None": + latency = round(latency, 3) + method_str = f"| {method:^30} | {status:^20} | {latency:^12} |\n" + repr_str += method_str + repr_str += horizontal_line + return
repr_str diff --git a/python/nano/src/bigdl/nano/pytorch/trainer/Trainer.py b/python/nano/src/bigdl/nano/pytorch/trainer/Trainer.py index 4ef4a6333d8..da54675124d 100644 --- a/python/nano/src/bigdl/nano/pytorch/trainer/Trainer.py +++ b/python/nano/src/bigdl/nano/pytorch/trainer/Trainer.py @@ -268,6 +268,7 @@ def trace(model: nn.Module, input_sample=None, accelerator: str = None, use_ipex: bool = False, + thread_num: int = None, onnxruntime_session_options=None, logging: bool = True, **export_kwargs): @@ -282,6 +283,9 @@ def trace(model: nn.Module, :param accelerator: The accelerator to use, defaults to None meaning staying in Pytorch backend. 'openvino', 'onnxruntime' and 'jit' are supported for now. :param use_ipex: whether we use ipex as accelerator for inferencing. default: False. + :param thread_num: (optional) an int that represents how many threads (cores) are needed for + inference, only valid for accelerator='onnxruntime' + or accelerator='openvino'. :param onnxruntime_session_options: The session option for onnxruntime, only valid when accelerator='onnxruntime', otherwise will be ignored. :param logging: whether to log detailed information of model conversion, only valid when @@ -298,6 +302,7 @@ def trace(model: nn.Module, input_sample=input_sample, accelerator=accelerator, use_ipex=use_ipex, + thread_num=thread_num, onnxruntime_session_options=onnxruntime_session_options, logging=logging, **export_kwargs) @@ -317,6 +322,7 @@ def quantize(model: nn.Module, timeout: int = None, max_trials: int = None, input_sample=None, + thread_num: int = None, onnxruntime_session_options=None, logging: bool = True, **export_kwargs): @@ -361,6 +367,9 @@ def quantize(model: nn.Module, "timeout=0, max_trials=1" means it will try quantization only once and return satisfying best model. :param input_sample: An input example to convert pytorch model into ONNX/OpenVINO. + :param thread_num: (optional) an int that represents how many threads (cores) are needed for + inference, only valid for accelerator='onnxruntime' + or accelerator='openvino'. :param onnxruntime_session_options: The session option for onnxruntime, only valid when accelerator='onnxruntime', otherwise will be ignored.
:param logging: whether to log detailed information of model conversion, only valid when @@ -382,6 +391,7 @@ def quantize(model: nn.Module, timeout=timeout, max_trials=max_trials, input_sample=input_sample, + thread_num=thread_num, onnxruntime_session_options=onnxruntime_session_options, logging=logging, **export_kwargs) diff --git a/python/nano/test/pytorch/tests/test_inference_pipeline_ipex.py b/python/nano/test/pytorch/tests/test_inference_pipeline_ipex.py index cbbf0fbe769..1d939aa7541 100644 --- a/python/nano/test/pytorch/tests/test_inference_pipeline_ipex.py +++ b/python/nano/test/pytorch/tests/test_inference_pipeline_ipex.py @@ -58,7 +58,7 @@ class TestInferencePipeline(TestCase): num_workers = 0 data_dir = os.path.join(os.path.dirname(__file__), "data") metric = torchmetrics.Accuracy(num_classes=10, top_k=1) - max_epochs = 10 + max_epochs = 5 model = Net() test_loader = create_data_loader(data_dir, 1, num_workers, data_transform, subset=10, shuffle=False) @@ -85,7 +85,7 @@ def test_pipeline_with_metric(self): validation_data=self.test_loader, metric=self.metric, direction="max", - cpu_num=1) + thread_num=1) acc_model, option = inference_opt.get_best_model() acc_model, option = inference_opt.get_best_model(accelerator="onnxruntime") @@ -99,7 +99,7 @@ def test_pipeline_without_metric(self): inference_opt = InferenceOptimizer() inference_opt.optimize(model=self.model, training_data=self.train_loader, - cpu_num=1) + thread_num=1) acc_model, option = inference_opt.get_best_model() acc_model, option = inference_opt.get_best_model(accelerator="onnxruntime")
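For quick reference, below is a minimal usage sketch of the API after this patch. It mirrors the updated example and tests above; the import path is the assumed public one for BigDL Nano, and `model`, `train_loader`, `val_loader` and `accuracy` are placeholders for your own module, dataloaders and metric.

```python
from bigdl.nano.pytorch import InferenceOptimizer  # assumed public import path

# `model`, `train_loader`, `val_loader` and `accuracy` are placeholders for
# your own nn.Module / LightningModule, DataLoaders and metric callable.
inference_opt = InferenceOptimizer()
inference_opt.optimize(model=model,
                       training_data=train_loader,
                       validation_data=val_loader,
                       metric=accuracy,
                       direction="max",
                       thread_num=1,          # renamed from cpu_num in this patch
                       latency_sample_num=30)
inference_opt.summary()   # new: prints the method / status / latency(ms) / accuracy table
acc_model, option = inference_opt.get_best_model(accelerator="onnxruntime")

# thread_num is also forwarded by InferenceOptimizer.trace / Trainer.trace and
# quantize for the 'onnxruntime' and 'openvino' accelerators:
ov_model = InferenceOptimizer.trace(model,
                                    accelerator="openvino",
                                    input_sample=next(iter(train_loader))[0],
                                    thread_num=2)
```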