Nano : reduce time cost for InferenceOptimizer and update demo #5740
@@ -35,6 +35,12 @@
 from bigdl.nano.deps.neural_compressor.inc_api import load_inc_model, quantize as inc_quantize
 from bigdl.nano.utils.inference.pytorch.model import AcceleratedLightningModule
 from bigdl.nano.pytorch.utils import TORCH_VERSION_LESS_1_10
+import warnings
+# Filter out useless Userwarnings
+warnings.filterwarnings('ignore', category=UserWarning, module='pytorch_lightning')
+warnings.filterwarnings('ignore', category=DeprecationWarning, module='pytorch_lightning')
+warnings.filterwarnings('ignore', category=UserWarning, module='torch')
+warnings.filterwarnings('ignore', category=DeprecationWarning, module='torch')

 import os
 os.environ['LOGLEVEL'] = 'ERROR'  # remove parital output of inc
@@ -167,6 +173,17 @@ def optimize(self, model: nn.Module,

         model.eval()  # change model to eval mode

+        input_sample = tuple(next(iter(training_data))[:-1])
Review comment: This basically assumes users have only one output and one or multiple inputs for the model's forward; maybe we can inspect the model and set a more accurate param num.
Reply: Good catch, will consider this.
+        st = time.perf_counter()
+        try:
+            with torch.no_grad():
+                model(*input_sample)
+        except Exception:
+            invalidInputError(False,
+                              "training_data is incompatible with your model input.")
+            exit(1)
rnwang04 marked this conversation as resolved.
+        baseline_time = time.perf_counter() - st

         print("==========================Start Optimization==========================")
         start_time = time.perf_counter()
         for idx, (method, available) in enumerate(available_dict.items()):
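As a rough sketch of the reviewer's suggestion above (inspect the model instead of assuming a single target per batch), one could count the required positional parameters of `forward` and slice the dataloader batch accordingly. The helper below is hypothetical and not part of this PR:

```python
import inspect

import torch
from torch import nn


def split_batch_by_forward_signature(model: nn.Module, batch):
    """Hypothetical helper: use the number of required positional parameters of
    model.forward to decide how many leading elements of a dataloader batch are
    model inputs; the remaining elements are treated as targets."""
    params = inspect.signature(model.forward).parameters.values()
    n_inputs = sum(
        p.default is inspect.Parameter.empty
        and p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)
        for p in params
    )
    return tuple(batch[:n_inputs]), tuple(batch[n_inputs:])


# Example: a two-input model whose dataloader yields (x1, x2, y)
class TwoInputNet(nn.Module):
    def forward(self, x1, x2):
        return x1 + x2


inputs, targets = split_batch_by_forward_signature(
    TwoInputNet(), (torch.randn(2, 3), torch.randn(2, 3), torch.zeros(2)))
assert len(inputs) == 2 and len(targets) == 1
```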
@@ -183,7 +200,6 @@ def optimize(self, model: nn.Module,
             precision: str = option.get_precision()
             # if precision is fp32, then we will use trace method
             if precision == "fp32":
-                input_sample = tuple(next(iter(training_data))[:-1])
                 try:
                     if accelerator is None and use_ipex is False:
                         acce_model = model
@@ -238,19 +254,28 @@ def func_test(model, input_sample):

                 torch.set_num_threads(thread_num)
                 try:
-                    result_map[method]["latency"] =\
-                        _throughput_calculate_helper(latency_sample_num, func_test,
-                                                     acce_model, input_sample)
+                    result_map[method]["latency"], status =\
+                        _throughput_calculate_helper(latency_sample_num, baseline_time,
+                                                     func_test, acce_model, input_sample)
+                    if status is False:
+                        result_map[method]["status"] = "early stopped"
+                        torch.set_num_threads(default_threads)
+                        continue
                 except Exception as e:
                     result_map[method]["status"] = "fail to forward"
                     torch.set_num_threads(default_threads)
                     continue

                 torch.set_num_threads(default_threads)
                 if self._calculate_accuracy:
-                    result_map[method]["accuracy"] =\
-                        _accuracy_calculate_helper(acce_model,
-                                                   metric, validation_data)
+                    # here we suppose trace don't change accuracy,
+                    # so we jump it to reduce time cost of optimize
+                    if precision == "fp32" and method != "original":
+                        result_map[method]["accuracy"] = "not recomputed"
+                    else:
+                        result_map[method]["accuracy"] =\
+                            _accuracy_calculate_helper(acce_model,
+                                                       metric, validation_data)
                 else:
                     result_map[method]["accuracy"] = None
@@ -329,9 +354,11 @@ def get_best_model(self,
                 continue

             if accuracy_criterion is not None:
-                accuracy: float = result["accuracy"]
+                accuracy = result["accuracy"]
                 compare_acc: float = best_metric.accuracy
-                if self._direction == "min":
+                if accuracy == "not recomputed":
+                    pass
+                elif self._direction == "min":
                     if (accuracy - compare_acc) / compare_acc > accuracy_criterion:
                         continue
                 else:
@@ -341,7 +368,11 @@ def get_best_model(self,
             # After the above conditions are met, the latency comparison is performed
             if result["latency"] < best_metric.latency:
                 best_model = result["model"]
-                best_metric = CompareMetric(method, result["latency"], result["accuracy"])
+                if result["accuracy"] != "not recomputed":
+                    accuracy = result["accuracy"]
+                else:
+                    accuracy = self.optimized_model_dict["original"]["accuracy"]
+                best_metric = CompareMetric(method, result["latency"], accuracy)

         return best_model, _format_acceleration_option(best_metric.method_name)
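For context on the accuracy_criterion branch above: the check is a relative deviation against the current best model, and "not recomputed" entries simply skip it. A tiny illustration of the "min" direction (lower metric is better); all numbers are placeholders:

```python
# Illustrative values only, mirroring the "min" branch of get_best_model above.
compare_acc = 0.200        # metric of the current best model (e.g. a loss)
accuracy = 0.205           # metric of the candidate model
accuracy_criterion = 0.01  # tolerate at most a 1% relative increase

rejected = (accuracy - compare_acc) / compare_acc > accuracy_criterion
print(rejected)  # True: a 2.5% relative increase exceeds the 1% criterion
```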
@@ -647,7 +678,7 @@ def _available_acceleration_combination():
     return available_dict


-def _throughput_calculate_helper(iterrun, func, *args):
+def _throughput_calculate_helper(iterrun, baseline_time, func, *args):
     '''
     A simple helper to calculate average latency
     '''
@@ -659,6 +690,9 @@ def _throughput_calculate_helper(iterrun, func, *args):
         func(*args)
         end = time.perf_counter()
         time_list.append(end - st)
+        # if three samples cost more than 4x time than baseline model, prune it
+        if i == 2 and end - start_time > 12 * baseline_time:
+            return np.mean(time_list) * 1000, False
         # at least need 10 iters and try to control calculation
         # time less than 2 min
         if i + 1 >= min(iterrun, 10) and (end - start_time) > 2:
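A quick note on the early-stop threshold above: the code comment talks about "4x time than baseline" per sample, while the condition compares the total time of the first three samples against `12 * baseline_time`; the two are equivalent (3 samples x 4x each). A made-up example:

```python
# Placeholder numbers only, to show why the threshold is 12x for three samples.
baseline_time = 0.05            # seconds per forward pass of the original model
elapsed_after_3_samples = 0.90  # total time of the first 3 samples of an accelerated model

# Averaging more than 4x the baseline over 3 samples means the total exceeds 12x the baseline.
print(elapsed_after_3_samples > 12 * baseline_time)  # True -> this method would be early stopped
```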
@@ -667,7 +701,7 @@ def _throughput_calculate_helper(iterrun, func, *args):
     time_list.sort()
     # remove top and least 10% data
     time_list = time_list[int(0.1 * iterrun): int(0.9 * iterrun)]
-    return np.mean(time_list) * 1000
+    return np.mean(time_list) * 1000, True
@@ -676,9 +710,10 @@ def _accuracy_calculate_helper(model, metric, data):
     '''
     metric_list = []
     sample_num = 0
-    for i, (data_input, target) in enumerate(data):
-        metric_list.append(metric(model(data_input), target).numpy() * data_input.shape[0])
-        sample_num += data_input.shape[0]
+    with torch.no_grad():
+        for i, (data_input, target) in enumerate(data):
+            metric_list.append(metric(model(data_input), target).numpy() * data_input.shape[0])
+            sample_num += data_input.shape[0]
     return np.sum(metric_list) / sample_num
@@ -690,7 +725,10 @@ def _format_acceleration_option(method_name: str) -> str:
     repr_str = ""
     for key, value in option.__dict__.items():
         if value is True:
-            repr_str = repr_str + key + " + "
+            if key == "pot":
+                repr_str = repr_str + "int8" + " + "
+            else:
+                repr_str = repr_str + key + " + "
         elif isinstance(value, str):
             repr_str = repr_str + value + " + "
     if len(repr_str) > 0:
@@ -705,9 +743,9 @@ def _format_optimize_result(optimize_result_dict: dict,
     '''
     if calculate_accuracy is True:
         horizontal_line = " {0} {1} {2} {3}\n" \
-            .format("-" * 32, "-" * 22, "-" * 14, "-" * 12)
+            .format("-" * 32, "-" * 22, "-" * 14, "-" * 22)
         repr_str = horizontal_line
-        repr_str += "| {0:^30} | {1:^20} | {2:^12} | {3:^10} |\n" \
+        repr_str += "| {0:^30} | {1:^20} | {2:^12} | {3:^20} |\n" \
            .format("method", "status", "latency(ms)", "accuracy")
         repr_str += horizontal_line
         for method, result in optimize_result_dict.items():
@@ -716,10 +754,10 @@ def _format_optimize_result(optimize_result_dict: dict,
             if latency != "None":
                 latency = round(latency, 3)
             accuracy = result.get("accuracy", "None")
-            if accuracy != "None":
+            if accuracy != "None" and isinstance(accuracy, float):
                 accuracy = round(accuracy, 3)
             method_str = f"| {method:^30} | {status:^20} | " \
-                         f"{latency:^12} | {accuracy:^10} |\n"
+                         f"{latency:^12} | {accuracy:^20} |\n"
             repr_str += method_str
         repr_str += horizontal_line
     else:
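To make the widened accuracy column concrete, this is roughly how one row of the summary renders with the new `{accuracy:^20}` field; the method name and numbers below are placeholders, not results from this PR:

```python
# Placeholder row, formatted the same way as in _format_optimize_result above.
method, status, latency, accuracy = "openvino_fp32", "successful", 3.142, "not recomputed"
print(f"| {method:^30} | {status:^20} | {latency:^12} | {accuracy:^20} |")
# prints one centered table row; "not recomputed" now fits inside the 20-character accuracy column
```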
Review discussion on unsetting KMP_AFFINITY:

Comment: We should not have the user manually manage this. @MeouSker77 @TheaperDeng

Reply: It seems the default value of KMP_AFFINITY in bigdl-nano-init is wrong: its current default causes the program to use only one core when inferencing, so it is unset here to use more cores. I'll fix its default value to use all cores by default.

Reply: This could be a really tricky one. The KMP_AFFINITY default value has some conflict with onnxruntime's core resource control (which makes it behave strangely). We cannot unset or reset this sys variable once the user starts their Python script. One solution we have used in training is creating a new process (with a different KMP_AFFINITY) to handle the work, but that is not reasonable here either, since creating a process costs 1-10 ms and is very unfriendly to low-latency requirements. One possible solution is to make the KMP_AFFINITY default value None (that is, not setting this value in bigdl-nano-init). What do you think? @jason-dai @MeouSker77

Reply: Now the default value of KMP_AFFINITY is granularity=fine,compact,1,0. In my test, removing the compact option, i.e. setting KMP_AFFINITY to granularity=fine,1,0, can avoid the conflict with onnxruntime, so that is also a possible solution. (The quoted explanation of the compact option is not reproduced here.) Shall we take this solution? @jason-dai @TheaperDeng By the way, multi-process training sets KMP_AFFINITY for its sub-processes automatically, so it won't be affected by this default value.

Reply: I think that should be OK.

Reply: Looks good; need to test with our performance suite.
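As a rough illustration of the workaround being discussed (not the final decision for bigdl-nano-init): the affinity value has to be in the environment before OpenMP-backed libraries are initialized, which is why resetting it later in a running script does not help. Everything below is a sketch under that assumption; granularity=fine,1,0 is the variant reported above as avoiding the onnxruntime conflict:

```python
import os

# Hypothetical: choose the KMP_AFFINITY value before importing torch / onnxruntime,
# since OpenMP reads it at initialization time and later changes have no effect.
# "granularity=fine,1,0" (the current default minus "compact") is the variant
# MeouSker77 reports as avoiding the onnxruntime conflict; it is not necessarily
# the value bigdl-nano-init will ship.
os.environ["KMP_AFFINITY"] = "granularity=fine,1,0"

import torch  # noqa: E402  imported after the env var so the setting takes effect
```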