diff --git a/nncf/openvino/quantization/compression_primitives.py b/nncf/openvino/quantization/compression_primitives.py
index 8bb268867a5..961a612021b 100644
--- a/nncf/openvino/quantization/compression_primitives.py
+++ b/nncf/openvino/quantization/compression_primitives.py
@@ -256,8 +256,8 @@ def _get_compress_model(
 
         compressed_w = opset.clamp(opset.round(compressed_w), level_low, level_high, name="compressed_weights")
 
-        INT8_OUTPUT = bool(int(os.environ.get("INT8_OUTPUT", "0")))
-        if INT8_OUTPUT:
+        FP32_OUTPUT = bool(int(os.environ.get("FP32_OUTPUT", "0")))
+        if not FP32_OUTPUT:
             compressed_w = opset.convert(compressed_w, dtype)
 
         results = [compressed_w]
@@ -272,8 +272,8 @@ def _get_compress_model(
 
         compiled_model = ov.compile_model(model, device_name="CPU")
 
-        NOT_SHARED_OUTPUTS = bool(int(os.environ.get("NOT_SHARED_OUTPUTS", "0")))
-        return compiled_model, lambda parameters: compiled_model(parameters, share_outputs=not NOT_SHARED_OUTPUTS)
+        SHARE_OUTPUTS = bool(int(os.environ.get("SHARE_OUTPUTS", "0")))
+        return compiled_model, lambda parameters: compiled_model(parameters, share_outputs=SHARE_OUTPUTS)
 
     @staticmethod
     def _get_compress_decompress_model(
diff --git a/run_weight_compression.py b/run_weight_compression.py
new file mode 100644
index 00000000000..f4219362c54
--- /dev/null
+++ b/run_weight_compression.py
@@ -0,0 +1,125 @@
+import os
+import shutil
+import subprocess
+import threading
+import time
+from pathlib import Path
+
+
+def stream_handler(stream, target_file):
+    for line in iter(stream.readline, ''):
+        print(line, end='')
+        target_file.write(line)
+
+
+parent_model_dir = Path("/home/nsavel/workspace/openvino.genai/llm_bench/python/models")
+parent_log_dir = Path("compression_logs")
+
+experiment_params = [
+    (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "tmp", "--numpy"),
+
+    # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "release_memory_att3/tiny-llama", "--numpy"),
+    # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "release_memory_att3/tiny-llama", "--end-to-end --release-memory"),
+    # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "release_memory_att3/tiny-llama", "--numpy"),
+    # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "release_memory_att3/tiny-llama", "--end-to-end --release-memory"),
+    # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "release_memory_att3/tiny-llama", "--numpy"),
+    # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "release_memory_att3/tiny-llama", "--end-to-end --dynamic --recompile --input-dtype fp32"),
+    #
+    # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "release_memory_att3/phi3", "--numpy"),
+    # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "release_memory_att3/phi3", "--end-to-end --release-memory"),
+    # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "release_memory_att3/phi3", "--numpy"),
+    # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "release_memory_att3/phi3", "--end-to-end --release-memory"),
+    # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "release_memory_att3/phi3", "--numpy"),
+    # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "release_memory_att3/phi3", "--end-to-end --dynamic --recompile --input-dtype fp32"),
+    #
+    # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "release_memory_att3/llama3-8b", "--numpy"),
+    # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "release_memory_att3/llama3-8b", "--end-to-end --release-memory"),
+    # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "release_memory_att3/llama3-8b", "--numpy"),
+    # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "release_memory_att3/llama3-8b", "--end-to-end --release-memory"),
+    # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "release_memory_att3/llama3-8b", "--numpy"),
+    # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "release_memory_att3/llama3-8b", "--end-to-end --dynamic --recompile --input-dtype fp32"),
+
+    # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--numpy"),
+    # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--end-to-end"),
+    # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--end-to-end --recompile"),
+    # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--end-to-end --release-memory"),
+    # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--end-to-end --share-outputs"),
+    # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--end-to-end --recompile --share-outputs"),
+    # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--end-to-end --release-memory --share-outputs"),
+    # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--dynamic --end-to-end"),
+    # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--dynamic --end-to-end --recompile"),
+    # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--dynamic --end-to-end --release-memory"),
+    # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--dynamic --end-to-end --share-outputs"),
+    # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--dynamic --end-to-end --recompile --share-outputs"),
+    # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--dynamic --end-to-end --release-memory --share-outputs"),
+    #
+    # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--numpy"),
+    # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--end-to-end"),
+    # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--end-to-end --recompile"),
+    # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--end-to-end --release-memory"),
+    # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--end-to-end --share-outputs"),
"tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--end-to-end --recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--end-to-end --release-memory --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--dynamic --end-to-end"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--dynamic --end-to-end --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--dynamic --end-to-end --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--dynamic --end-to-end --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--dynamic --end-to-end --recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--dynamic --end-to-end --release-memory --share-outputs"), + # + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--input-dtype fp32 --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--input-dtype fp32 --end-to-end"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--input-dtype fp32 --end-to-end --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--input-dtype fp32 --end-to-end --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--input-dtype fp32 --end-to-end --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--input-dtype fp32 --end-to-end --recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--input-dtype fp32 --end-to-end --release-memory --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--dynamic --input-dtype fp32 --end-to-end"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--dynamic --input-dtype fp32 --end-to-end --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--dynamic --input-dtype fp32 --end-to-end --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--dynamic --input-dtype fp32 --end-to-end --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--dynamic --input-dtype fp32 --end-to-end --recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile-vs-release_tiny-llama_att3", "--dynamic --input-dtype fp32 --end-to-end --release-memory --share-outputs"), +] + +for model_dir, log_dir, params in experiment_params: + model_path = model_dir / 
"openvino_model.xml" + cmd = f"/home/nsavel/venvs/nncf/bin/python weight_compression.py --model-path {model_path} --log-dir {log_dir} {params}" + + log_dir.mkdir(parents=True, exist_ok=True) + with open(log_dir / "log.txt", "a") as log_file: + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True, + universal_newlines=True, + preexec_fn=os.setsid, + ) + + stdout_thread = threading.Thread(target=stream_handler, args=(process.stdout, log_file)) + stderr_thread = threading.Thread(target=stream_handler, args=(process.stderr, log_file)) + + stdout_thread.start() + stderr_thread.start() + + stdout_thread.join() + stderr_thread.join() + + process.wait() + time.sleep(5) + +evaluated_paths = set() +for _, log_dir, _ in experiment_params: + for model_path in log_dir.rglob("**/*"): + model_path: Path + if model_path.suffix != ".xml": + continue + if model_path.absolute() in evaluated_paths: + continue + evaluated_paths.add(model_path.absolute()) + + model_dir = model_path.parent.absolute() + cmd = f"/home/nsavel/venvs/lm-evaluation-harness/bin/lm_eval --model openvino --model_args pretrained={model_dir},trust_remote_code=True --tasks wikitext --output_path {model_dir}" + process = subprocess.Popen(cmd, shell=True) + process.wait() diff --git a/weight_compression.py b/weight_compression.py index 8e0cbabfb9a..50b09c47254 100644 --- a/weight_compression.py +++ b/weight_compression.py @@ -38,13 +38,13 @@ def parse_arguments(): parser.add_argument("--end-to-end", action="store_true", help="Enable end-to-end OV compression") - parser.add_argument("--input-dtype", type=str, choices=["fp32", "fp16", "bf16"], default="fp32", help="OV model input dtype") + parser.add_argument("--input-dtype", type=str, choices=["fp32", "fp16", "bf16"], default=None, help="OV model input dtype") - parser.add_argument("--int8-output", action="store_true", help="Output in (u)int8") + parser.add_argument("--fp32-output", action="store_true", help="Output in fp32 instead of (u)int8") parser.add_argument("--recompile", action="store_true", help="Recompile model every time") - parser.add_argument("--not-shared-outputs", action="store_true", help="Do not share outputs") + parser.add_argument("--share-outputs", action="store_true", help="Share OV model outputs") parser.add_argument("--save-model", action="store_true", help="Save compressed model") @@ -63,6 +63,19 @@ def log(mm, fz, log_dir): ) +def count_node_dtypes(model): + # Get the main dtype of weight constants + node_count_per_dtype = dict(f32=0, f16=0, bf16=0) + for node in model.get_ordered_ops(): + friendly_name = node.get_friendly_name() + if node.get_type_name() != "Constant" or ".weight" not in friendly_name: + continue + const_dtype = node.get_element_type().get_type_name() + if const_dtype in node_count_per_dtype: + node_count_per_dtype[const_dtype] = node_count_per_dtype[const_dtype] + 1 + return node_count_per_dtype + + def main(args): model_path = Path(args.model_path) log_dir = Path(args.log_dir) @@ -71,26 +84,32 @@ def main(args): dynamic_compression = args.dynamic end_to_end_compression = args.end_to_end input_dtype = args.input_dtype - int8_output = args.int8_output + fp32_output = args.fp32_output recompile = args.recompile - not_shared_outputs = args.not_shared_outputs + share_outputs = args.share_outputs save_model = args.save_model compare_with_numpy = args.compare_with_numpy invert_numpy_division = args.invert_numpy_division release_memory = args.release_memory + + log_dir_suffix = f"{model_path.parent.name}_" if 
     if numpy_compression:
-        log_dir_suffix = "numpy"
+        log_dir_suffix = f"{log_dir_suffix}numpy"
         if invert_numpy_division:
             log_dir_suffix += "_inverted"
     else:
-        log_dir_suffix = "end-to-end_" if end_to_end_compression else ""
+        log_dir_suffix = f"{log_dir_suffix}end-to-end_" if end_to_end_compression else ""
         log_dir_suffix = f"{log_dir_suffix}{'ov-dynamic' if dynamic_compression else 'ov-static'}"
-        log_dir_suffix = f"{log_dir_suffix}_{'output-int8' if int8_output else 'output-fp32'}"
-        log_dir_suffix = f"{log_dir_suffix}_{f'input-{input_dtype}'}"
+        log_dir_suffix = f"{log_dir_suffix}_{'output-fp32' if fp32_output else 'output-i8'}"
+        if input_dtype is not None:
+            log_dir_suffix = f"{log_dir_suffix}_{f'input-{input_dtype}'}"
         if recompile:
             log_dir_suffix = f"{log_dir_suffix}_recompile"
-        if not_shared_outputs:
-            log_dir_suffix = f"{log_dir_suffix}_not-shared-outputs"
+        if release_memory:
+            log_dir_suffix = f"{log_dir_suffix}_release-memory"
+        if share_outputs:
+            log_dir_suffix = f"{log_dir_suffix}_share-outputs"
+    print(f"Log dir suffix: {log_dir_suffix}")
 
     memory_monitors = []
     for memory_type, mem_from_zero in [(MemoryType.RSS, False), (MemoryType.SYSTEM, False), (MemoryType.SYSTEM, True)]:
@@ -102,13 +121,22 @@ def main(args):
     # core.set_property({"ENABLE_MMAP": "NO"})
     model = core.read_model(model_path)
 
+    node_count_per_dtype = count_node_dtypes(model)
+    assert max(node_count_per_dtype.values()) == sum(node_count_per_dtype.values()), "Not all consts have the same type"
+    node_count_per_dtype = sorted([(v, k) for k, v in node_count_per_dtype.items()], reverse=True)
+    model_dtype = dict(f32="fp32", f16="fp16", bf16="bf16")[node_count_per_dtype[0][1]]
+
+    # Update input dtype based on model
+    if input_dtype is None:
+        input_dtype = "fp32" if model_dtype == "bf16" else model_dtype
+
     os.environ["NUMPY_COMPRESSION"] = f"{int(numpy_compression)}"
     os.environ["DYNAMIC_COMPRESSION"] = f"{int(dynamic_compression)}"
     os.environ["END_TO_END_COMPRESSION"] = f"{int(end_to_end_compression)}"
     os.environ["INPUT_DTYPE"] = input_dtype
-    os.environ["INT8_OUTPUT"] = f"{int(int8_output)}"
+    os.environ["FP32_OUTPUT"] = f"{int(fp32_output)}"
     os.environ["RECOMPILE"] = f"{int(recompile)}"
-    os.environ["NOT_SHARED_OUTPUTS"] = f"{int(not_shared_outputs)}"
+    os.environ["SHARE_OUTPUTS"] = f"{int(share_outputs)}"
     os.environ["COMPARE_WITH_NUMPY"] = f"{int(compare_with_numpy)}"
     os.environ["INVERT_NUMPY_DIVISION"] = f"{int(invert_numpy_division)}"
     os.environ["RELEASE_MEMORY"] = f"{int(release_memory)}"
@@ -157,8 +185,12 @@ def main(args):
         if not csv_exists:
             f.write(
                 "Model Path,"
+                "Model dtype,"
                 "Backend,"
-                "End-to-end,"
+                "End to end,"
+                "Recompile,"
+                "Release memory,"
+                "Share outputs,"
                 "Input Shapes,"
                 "Input,"
                 "Output,"
@@ -170,11 +202,15 @@ def main(args):
             )
         f.write(
             f"{model_path},"
+            f"{model_dtype.upper()},"
             f"{'NumPy' if numpy_compression else 'OV'},"
-            f"{end_to_end_compression},"
+            f"{'-' if numpy_compression else end_to_end_compression},"
+            f"{'-' if numpy_compression else recompile},"
+            f"{'-' if numpy_compression else release_memory},"
+            f"{'-' if numpy_compression else share_outputs},"
             f"{'-' if numpy_compression else 'Dynamic' if dynamic_compression else 'Static'},"
             f"{'-' if numpy_compression else input_dtype.upper()},"
-            f"{'-' if numpy_compression else 'INT8' if int8_output else 'FP32'},"
+            f"{'-' if numpy_compression else 'FP32' if fp32_output else 'INT8'},"
             f"{compression_time:.2f},"
             f"{peak_memory:.2f},"
             f"{cache_size:.2f},"
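
Note on the FP32_OUTPUT / SHARE_OUTPUTS rename: the two scripts still communicate through "0"/"1" environment variables, but the polarity of both toggles is flipped, so each new flag defaults to off and must be set explicitly to get the previous default behavior (fp32 outputs, shared outputs). A minimal sketch of the parsing pattern used above; the env_flag helper is hypothetical and added here only for illustration:

import os

def env_flag(name: str, default: str = "0") -> bool:
    # Same parsing as compression_primitives.py: the variable is expected to
    # hold "0" or "1"; any non-integer value raises ValueError.
    return bool(int(os.environ.get(name, default)))

os.environ["FP32_OUTPUT"] = "1"  # what `--fp32-output` exports in weight_compression.py
assert env_flag("FP32_OUTPUT") is True
assert env_flag("SHARE_OUTPUTS") is False  # unset toggles default to off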
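
And a worked example of the new input-dtype fallback in weight_compression.py: when --input-dtype is omitted, the dtype of the model's weight constants wins, except that bf16 models fall back to fp32 inputs. The constant counts below are hypothetical, chosen only to illustrate the rule:

# Hypothetical census, shaped like the dict count_node_dtypes() returns.
node_count_per_dtype = {"f32": 0, "f16": 0, "bf16": 291}
ranked = sorted([(v, k) for k, v in node_count_per_dtype.items()], reverse=True)
model_dtype = {"f32": "fp32", "f16": "fp16", "bf16": "bf16"}[ranked[0][1]]
input_dtype = "fp32" if model_dtype == "bf16" else model_dtype  # bf16 weights are fed as fp32
assert (model_dtype, input_dtype) == ("bf16", "fp32")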