diff --git a/.azure-pipelines/model-test-3x.yml b/.azure-pipelines/model-test-3x.yml
index 2be418aa8c3..724b81f95bd 100644
--- a/.azure-pipelines/model-test-3x.yml
+++ b/.azure-pipelines/model-test-3x.yml
@@ -10,6 +10,7 @@ pr:
     include:
       - neural_compressor/common
       - neural_compressor/torch
+      - examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm
       - setup.py
       - requirements_pt.txt
       - .azure-pipelines/scripts/models
diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml
index b1ee3301701..697e70799c4 100644
--- a/.github/checkgroup.yml
+++ b/.github/checkgroup.yml
@@ -64,6 +64,7 @@ subprojects:
     paths:
       - "neural_compressor/common/**"
       - "neural_compressor/torch/**"
+      - "examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/**"
      - "setup.py"
      - "requirements_pt.txt"
      - ".azure-pipelines/scripts/models/**"
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/requirements.txt
index ebea194b93b..f0b56e558d3 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/requirements.txt
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/requirements.txt
@@ -9,5 +9,5 @@ wandb
 einops
 neural-compressor
 intel-extension-for-transformers
-lm-eval
+lm_eval==0.4.2
 peft
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py
index bc973d28491..090474f4356 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py
@@ -51,7 +51,7 @@
 parser.add_argument("--calib_iters", default=512, type=int,
                     help="calibration iters.")
 parser.add_argument("--tasks", default="lambada_openai,hellaswag,winogrande,piqa,wikitext",
-                    type=str, help="tasks list for accuracy validation")
+                    type=str, help="tasks for accuracy validation")
 parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model")
 # ============SmoothQuant configs==============
 parser.add_argument("--sq", action="store_true")
@@ -372,39 +372,36 @@ def run_fn(model):
         )
     user_model.save(args.output_dir)
 
-if args.int8 or args.int8_bf16_mixed:
-    print("load int8 model")
-    from neural_compressor.torch.algorithms.static_quant import load
+# TODO: add run_benchmark.sh for loading and remove --accuracy from run_quant.sh; currently run_quant.sh reports the fp32 result
+# if args.int8 or args.int8_bf16_mixed:
+#     print("load int8 model")
 
-    if args.ipex:
-        user_model = load(os.path.abspath(os.path.expanduser(args.output_dir)))
-    else:
-        # TODO: WOQ save&load
-        print("Int8 model loading does not support WeightOnlyQuant now.")
-        pass
-else:
-    user_model, _ = get_user_model()
+#     # TODO: from neural_compressor.torch.quantization import load
+#     from neural_compressor.torch.algorithms.static_quant import load
+
+#     if args.ipex:
+#         user_model = load(os.path.abspath(os.path.expanduser(args.output_dir)))
+#     else:
+#         # TODO: WOQ save&load
+#         print("Int8 model loading does not support WeightOnlyQuant now.")
+#         pass
+# else:
+#     user_model, _ = get_user_model()
 
 if args.accuracy:
     user_model.eval()
     from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
     eval_args = LMEvalParser(
-        model="hf",
-        model_args='pretrained=' + args.model + ',tokenizer=' + args.model + ',dtype=float32',
+        model="hf",
         user_model=user_model,
-        tokenizer = tokenizer,
+        tokenizer=tokenizer,
         batch_size=args.batch_size,
         tasks=args.tasks,
         device="cpu",
     )
     results = evaluate(eval_args)
-
-    dumped = json.dumps(results, indent=2)
-    if args.save_accuracy_path:
-        with open(args.save_accuracy_path, "w") as f:
-            f.write(dumped)
     for task_name in args.tasks.split(","):
         if task_name == "wikitext":
             acc = results["results"][task_name]["word_perplexity,none"]
@@ -415,16 +412,14 @@ def run_fn(model):
 
 if args.performance:
     user_model.eval()
-    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate
+    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
     import time
 
     samples = args.iters * args.batch_size
-    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
     eval_args = LMEvalParser(
-        model="hf",
-        model_args='pretrained=' + args.model + ',tokenizer=' + args.model + ',dtype=float32',
+        model="hf",
         user_model=user_model,
-        tokenizer = tokenizer,
+        tokenizer=tokenizer,
         batch_size=args.batch_size,
         tasks=args.tasks,
         limit=samples,
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/requirements.txt b/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/requirements.txt
index f27b0a5f9c2..e129cb6dc91 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/requirements.txt
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/requirements.txt
@@ -1,7 +1,7 @@
 accelerate
 datasets
 einops
-intel_extension_for_transformers
+intel-extension-for-transformers
 optimum
 peft
 sentencepiece
@@ -10,4 +10,4 @@ torch
 tqdm
 tiktoken
 transformers_stream_generator
-git+https://github.com/EleutherAI/lm-evaluation-harness.git@cc9778fbe4fa1a709be2abed9deb6180fd40e7e2
+lm_eval==0.4.2
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/run_clm_sparsegpt.py b/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/run_clm_sparsegpt.py
index 14f70ff3d9e..5b34ae79382 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/run_clm_sparsegpt.py
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/run_clm_sparsegpt.py
@@ -51,7 +51,6 @@ def skip(*args, **kwargs):
 from timers import CPUTimer, GPUTimer
 from neural_compressor.training import WeightPruningConfig, prepare_pruning
 from neural_compressor.compression.pruner import (parse_auto_slim_config)
-from intel_extension_for_transformers.llm.evaluation.lm_eval import evaluate
 
 check_min_version("4.27.0.dev0")
 logger = logging.getLogger(__name__)
@@ -271,8 +270,8 @@ def parse_args():
                         help="Transformers parameter: use the external repo")
 
     # Evaluation config
-    parser.add_argument("--tasks", default=["lambada_openai"],
-                        help="Usually chosen with ['lambada_openai','hellaswag','winogrande','piqa']",
+    parser.add_argument("--tasks", default="lambada_openai",
+                        type=str, help="tasks for accuracy validation",
     )
     parser.add_argument("--use_accelerate", action='store_true',
                         help="Usually use to accelerate evaluation for large models"
@@ -588,14 +587,17 @@ def group_texts(examples):
         model_args = f'pretrained={model_name},tokenizer={model_name},dtype={dtype},use_accelerate={args.use_accelerate},trust_remote_code={args.trust_remote_code}'
         eval_batch = args.per_device_eval_batch_size
         user_model = None if args.use_accelerate else model
-        results = evaluate(
-            model="hf-causal",
-            model_args=model_args,
-            user_model=user_model,
-            batch_size=eval_batch,
-            tasks=args.tasks,
-            device=device,
-        )
+
+        from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+        eval_args = LMEvalParser(
+            model="hf",
+            user_model=user_model,
+            tokenizer=tokenizer,
+            batch_size=eval_batch,
+            tasks=args.tasks,
+            device=device,
+        )
+        results = evaluate(eval_args)
 
 if __name__ == "__main__":
     main()
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/requirements.txt b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/requirements.txt
index f409996c8b7..fe73842a104 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/requirements.txt
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/requirements.txt
@@ -10,4 +10,4 @@ wandb
 einops
 neural-compressor
 intel-extension-for-transformers
-git+https://github.com/EleutherAI/lm-evaluation-harness.git@cc9778fbe4fa1a709be2abed9deb6180fd40e7e2
+lm_eval==0.4.2
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py
index 056ebb547ac..ce4b7f9ab4f 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py
@@ -50,9 +50,8 @@
                     help="Pad input ids to max length.")
 parser.add_argument("--calib_iters", default=512, type=int,
                     help="calibration iters.")
-parser.add_argument("--tasks", nargs='+', default=["lambada_openai",
-                    "hellaswag", "winogrande", "piqa", "wikitext"],
-                    type=str, help="tasks list for accuracy validation, text-generation and code-generation tasks are different.")
+parser.add_argument("--tasks", default="lambada_openai,hellaswag,winogrande,piqa,wikitext",
+                    type=str, help="tasks for accuracy validation, text-generation and code-generation tasks are different.")
 parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model")
 # ============SmoothQuant configs==============
 parser.add_argument("--sq", action="store_true")
@@ -351,62 +350,82 @@ def eval_func(model):
 if args.accuracy:
     user_model.eval()
     if args.code_generation:
-        from intel_extension_for_transformers.llm.evaluation.lm_code_eval import evaluate
+        from intel_extension_for_transformers.transformers.llm.evaluation.bigcode_eval import evaluate
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
         results = evaluate(
             model=user_model,
             tokenizer=tokenizer,
-            tasks=",".join(args.tasks),
+            tasks=args.tasks,
             batch_size=args.batch_size,
             args=args,
         )
+        for task_name in args.tasks:
+            if task_name == "truthfulqa_mc":
+                acc = results["results"][task_name]["mc1"]
+            else:
+                acc = results["results"][task_name]["acc"]
     else:
-        from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate
-        results = evaluate(
-            model="hf-causal",
-            model_args='pretrained=' + args.model + ',tokenizer=' + args.model + ',dtype=float32',
+        from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+        eval_args = LMEvalParser(
+            model="hf",
             user_model=user_model,
+            tokenizer=tokenizer,
             batch_size=args.batch_size,
             tasks=args.tasks,
+            device="cpu",
         )
+        results = evaluate(eval_args)
+        for task_name in args.tasks.split(","):
+            if task_name == "wikitext":
+                acc = results["results"][task_name]["word_perplexity,none"]
+            else:
+                acc = results["results"][task_name]["acc,none"]
 
-    dumped = json.dumps(results, indent=2)
-    if args.save_accuracy_path:
-        with open(args.save_accuracy_path, "w") as f:
-            f.write(dumped)
-    for task_name in args.tasks:
-        if task_name == "wikitext":
-            acc = results["results"][task_name]["word_perplexity"]
-        else:
-            acc = results["results"][task_name]["acc"]
     print("Accuracy: %.5f" % acc)
     print('Batch size = %d' % args.batch_size)
 
 if args.performance:
-    user_model.eval()
-    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate
     import time
-
+    user_model.eval()
     samples = args.iters * args.batch_size
-    start = time.time()
-    results = evaluate(
-        model="hf-causal",
-        model_args='pretrained=' + args.model + ',tokenizer=' + args.model \
-                   + ',dtype=float32' + ",trust_remote_code=" + str(args.trust_remote_code),
-        user_model=user_model,
-        batch_size=args.batch_size,
-        tasks=args.tasks,
-        limit=samples,
-    )
-    end = time.time()
-    for task_name in args.tasks:
-        if task_name == "wikitext":
-            acc = results["results"][task_name]["word_perplexity"]
-        elif task_name == "truthfulqa_mc":
-            acc = results["results"][task_name]["mc1"]
-        else:
-            acc = results["results"][task_name]["acc"]
+
+    if args.code_generation:
+        from intel_extension_for_transformers.transformers.llm.evaluation.bigcode_eval import evaluate
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
+        start = time.time()
+        results = evaluate(
+            model=user_model,
+            tokenizer=tokenizer,
+            tasks=args.tasks,
+            batch_size=args.batch_size,
+            args=args,
+        )
+        end = time.time()
+        for task_name in args.tasks:
+            if task_name == "truthfulqa_mc":
+                acc = results["results"][task_name]["mc1"]
+            else:
+                acc = results["results"][task_name]["acc"]
+    else:
+        from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+        eval_args = LMEvalParser(
+            model="hf",
+            user_model=user_model,
+            tokenizer=tokenizer,
+            batch_size=args.batch_size,
+            tasks=args.tasks,
+            device="cpu",
+        )
+        start = time.time()
+        results = evaluate(eval_args)
+        end = time.time()
+        for task_name in args.tasks.split(","):
+            if task_name == "wikitext":
+                acc = results["results"][task_name]["word_perplexity,none"]
+            else:
+                acc = results["results"][task_name]["acc,none"]
     print("Accuracy: %.5f" % acc)
     print('Throughput: %.3f samples/sec' % (samples / (end - start)))
     print('Latency: %.3f ms' % ((end - start) * 1000 / samples))
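
For reference, the evaluation call pattern these scripts migrate to (`LMEvalParser` plus `evaluate` from intel-extension-for-transformers, backed by `lm_eval==0.4.2`) can be exercised on its own roughly as sketched below. This is a minimal sketch, not part of the patch: the import path and field names are taken from the diff, the model name is a placeholder chosen for illustration, and exact `LMEvalParser` fields may differ across intel-extension-for-transformers releases.

```python
# Minimal standalone sketch of the lm_eval==0.4.2 evaluation flow used by the patched scripts.
# Assumes intel-extension-for-transformers exposes LMEvalParser/evaluate at the import path
# shown in the diff; the model below is a placeholder, not one used by the examples.
from transformers import AutoModelForCausalLM, AutoTokenizer
from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import (
    evaluate,
    LMEvalParser,
)

model_name = "facebook/opt-125m"  # placeholder checkpoint for illustration
user_model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

eval_args = LMEvalParser(
    model="hf",              # HF backend, as in the patched scripts
    user_model=user_model,   # pass the in-memory (e.g. quantized) model directly
    tokenizer=tokenizer,
    tasks="lambada_openai",  # comma-separated task string in lm_eval 0.4.x
    batch_size=8,
    device="cpu",
)
results = evaluate(eval_args)

# lm_eval 0.4.x metric keys carry a ",none" filter suffix (see the updated scripts).
print(results["results"]["lambada_openai"]["acc,none"])
```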