Added support of HF and GenAI models into CLI (#887)
AlexKoff88 authored Sep 21, 2024
1 parent de77f96 commit eecf70f
Showing 6 changed files with 135 additions and 35 deletions.
10 changes: 6 additions & 4 deletions .github/workflows/llm_bench-python.yml
@@ -40,7 +40,8 @@ jobs:
python -m pip install --upgrade pip
python -m pip install flake8 pytest black
GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ${{ env.LLM_BENCH_PYPATH }}/requirements.txt
-pip install openvino-nightly
+python -m pip install -U --pre openvino openvino-tokenizers openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ${{ env.WWB_PATH }}/requirements.txt
GIT_CLONE_PROTECTION_ACTIVE=false pip install ${{ env.WWB_PATH }}

@@ -73,7 +74,7 @@ jobs:
python ./llm_bench/python/benchmark.py -m ./ov_models/tiny-sd/pytorch/dldt/FP16/ -pf ./llm_bench/python/prompts/stable-diffusion.jsonl -d cpu -n 1
- name: WWB Tests
run: |
-python -m pytest ./llm_bench/python/who_what_benchmark/tests
+python -m pytest llm_bench/python/who_what_benchmark/tests
stateful:
runs-on: ubuntu-20.04
steps:
@@ -85,12 +86,13 @@ jobs:
run: |
GIT_CLONE_PROTECTION_ACTIVE=false python -m pip install -r llm_bench/python/requirements.txt
python -m pip uninstall --yes openvino
-python -m pip install openvino-nightly
+python -m pip install -U --pre openvino openvino-tokenizers openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
python llm_bench/python/convert.py --model_id TinyLlama/TinyLlama-1.1B-Chat-v1.0 --output_dir . --stateful
grep beam_idx pytorch/dldt/FP32/openvino_model.xml
- name: WWB Tests
run: |
GIT_CLONE_PROTECTION_ACTIVE=false pip install -r llm_bench/python/who_what_benchmark/requirements.txt
GIT_CLONE_PROTECTION_ACTIVE=false pip install llm_bench/python/who_what_benchmark/
pip install pytest
python -m pytest llm_bench/python/who_what_benchmark/tests
+python -m pytest llm_bench/python/who_what_benchmark/tests
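
For local reproduction, the updated CI setup boils down to the following commands, taken from the workflow above (run from the repository root):

```sh
python -m pip install -r llm_bench/python/requirements.txt
python -m pip uninstall --yes openvino
python -m pip install -U --pre openvino openvino-tokenizers openvino-genai \
    --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
python -m pytest llm_bench/python/who_what_benchmark/tests
```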
25 changes: 17 additions & 8 deletions llm_bench/python/who_what_benchmark/README.md
@@ -55,27 +55,36 @@ metrics_per_prompt, metrics = evaluator.score(optimized_model, test_data=prompts
```sh
wwb --help

-# run ground truth generation for uncompressed model on the first 32 samples from squad dataset
-# ground truth will be saved in llama_2_7b_squad_gt.csv file
+# Run ground truth generation for uncompressed model on the first 32 samples from squad dataset
+# Ground truth will be saved in llama_2_7b_squad_gt.csv file
wwb --base-model meta-llama/Llama-2-7b-chat-hf --gt-data llama_2_7b_squad_gt.csv --dataset squad --split validation[:32] --dataset-field question

-# run comparison with compressed model on the first 32 samples from squad dataset
+# Run comparison with compressed model on the first 32 samples from squad dataset
wwb --target-model /home/user/models/Llama_2_7b_chat_hf_int8 --gt-data llama_2_7b_squad_gt.csv --dataset squad --split validation[:32] --dataset-field question

-# output will be like this
+# Output will be like this
# similarity FDT SDT FDT norm SDT norm
# 0 0.972823 67.296296 20.592593 0.735127 0.151505

-# run ground truth generation for uncompressed model on internal set of questions
-# ground truth will be saved in llama_2_7b_squad_gt.csv file
+# Run ground truth generation for uncompressed model on internal set of questions
+# Ground truth will be saved in llama_2_7b_squad_gt.csv file
wwb --base-model meta-llama/Llama-2-7b-chat-hf --gt-data llama_2_7b_wwb_gt.csv

-# run comparison with compressed model on internal set of questions
+# Run comparison with compressed model on internal set of questions
wwb --target-model /home/user/models/Llama_2_7b_chat_hf_int8 --gt-data llama_2_7b_wwb_gt.csv

+## Control the number of samples and use verbose mode to see the difference in the results
+# Use --num-samples to control the number of samples
+wwb --base-model meta-llama/Llama-2-7b-chat-hf --gt-data llama_2_7b_wwb_gt.csv --num-samples 10
+
+# Use -v for verbose mode to see the difference in the results
+wwb --target-model /home/user/models/Llama_2_7b_chat_hf_int8 --gt-data llama_2_7b_wwb_gt.csv --num-samples 10 -v
+
+# Use --hf AutoModelForCausalLM to instantiate the model from model_id/folder
+wwb --base-model meta-llama/Llama-2-7b-chat-hf --gt-data llama_2_7b_wwb_gt.csv --hf
+
+# Use --language parameter to control the language of prompts
+# Autodetection works for basic Chinese models
+wwb --base-model meta-llama/Llama-2-7b-chat-hf --gt-data llama_2_7b_wwb_gt.csv --language en
```
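
The same comparison can be driven from the Python API using the new `gen_answer_fn` hook added in this commit. A minimal sketch, assuming the `whowhatbench` import path and a locally exported model directory; note the CLI additionally wraps the pipeline in `GenAIModelWrapper` so that `model.config` resolves, which this simplified version skips:

```python
import openvino_genai
import whowhatbench
from transformers import AutoTokenizer

model_dir = "./models/llama-2-7b-chat-ov"  # hypothetical path to an exported model

pipe = openvino_genai.LLMPipeline(model_dir, "CPU")
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Same callback shape as genai_gen_answer() in wwb.py below
def gen_answer(model, tokenizer, question, max_new_tokens, skip_question):
    return model.generate(question, max_new_tokens=max_new_tokens)

evaluator = whowhatbench.Evaluator(
    base_model=pipe,
    tokenizer=tokenizer,
    num_samples=10,
    gen_answer_fn=gen_answer,
)
evaluator.dump_gt("llama_2_7b_wwb_gt.csv")
```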

### Supported metrics
6 changes: 4 additions & 2 deletions llm_bench/python/who_what_benchmark/requirements.txt
@@ -1,8 +1,10 @@
transformers>=4.35.2
sentence-transformers>=2.2.2
-openvino>=2023.3.0
-openvino-telemetry>=2023.2.1
+openvino>=2024.3.0
+openvino-telemetry>=2024.3.0
optimum-intel>=1.14
+openvino-tokenizers>=2024.3.0
+openvino-genai>=2024.3.0
pandas>=2.0.3
numpy>=1.23.5
tqdm>=4.66.1
46 changes: 40 additions & 6 deletions llm_bench/python/who_what_benchmark/tests/test_cli.py
@@ -32,17 +32,21 @@ def run_wwb(args):


def setup_module():
+from optimum.exporters.openvino.convert import export_tokenizer
+
logger.info("Create models")
tokenizer = AutoTokenizer.from_pretrained(model_id)
base_model = OVModelForCausalLM.from_pretrained(model_id)
base_model.save_pretrained(base_model_path)
tokenizer.save_pretrained(base_model_path)
+export_tokenizer(tokenizer, base_model_path)

target_model = OVModelForCausalLM.from_pretrained(
model_id, quantization_config=OVWeightQuantizationConfig(bits=8)
)
target_model.save_pretrained(target_model_path)
tokenizer.save_pretrained(target_model_path)
+export_tokenizer(tokenizer, target_model_path)


def teardown_module():
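
A note on the new `export_tokenizer` calls above: `openvino_genai.LLMPipeline` expects tokenizer/detokenizer models converted to OpenVINO IR alongside the LLM, so saving the HF tokenizer alone is not enough. A standalone sketch of the same conversion; the output file names follow the openvino-tokenizers convention as I understand it:

```python
from optimum.exporters.openvino.convert import export_tokenizer
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
# Presumably writes openvino_tokenizer.xml/.bin and openvino_detokenizer.xml/.bin
# next to the exported model.
export_tokenizer(tokenizer, "./exported_model_dir")  # hypothetical output dir
```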
@@ -57,9 +61,10 @@ def test_target_model():
"--num-samples", "2",
"--device", "CPU"
])

assert result.returncode == 0
assert "Metrics for model" in result.stdout
assert "## Reference text" not in result.stdout
assert "Metrics for model" in result.stderr
assert "## Reference text" not in result.stderr


@pytest.fixture
@@ -76,8 +81,6 @@ def test_gt_data():
"--num-samples", "2",
"--device", "CPU"
])
-import time
-time.sleep(1)
data = pd.read_csv(temp_file_name)
os.remove(temp_file_name)

@@ -95,7 +98,7 @@ def test_output_directory():
"--output", temp_dir
])
assert result.returncode == 0
assert "Metrics for model" in result.stdout
assert "Metrics for model" in result.stderr
assert os.path.exists(os.path.join(temp_dir, "metrics_per_qustion.csv"))
assert os.path.exists(os.path.join(temp_dir, "metrics.csv"))

@@ -109,7 +112,7 @@ def test_verbose():
"--verbose"
])
assert result.returncode == 0
assert "## Reference text" in result.stdout
assert "## Diff " in result.stderr


def test_language_autodetect():
@@ -127,3 +130,34 @@ def test_language_autodetect():

assert result.returncode == 0
assert "马克" in data["questions"].values[0]


+def test_hf_model():
+    with tempfile.NamedTemporaryFile(suffix=".csv") as tmpfile:
+        temp_file_name = tmpfile.name
+
+    result = run_wwb([
+        "--base-model", model_id,
+        "--gt-data", temp_file_name,
+        "--num-samples", "2",
+        "--device", "CPU",
+        "--hf"
+    ])
+    data = pd.read_csv(temp_file_name)
+    os.remove(temp_file_name)
+
+    assert result.returncode == 0
+    assert len(data["questions"].values) == 2
+
+
+def test_genai_model():
+    result = run_wwb([
+        "--base-model", base_model_path,
+        "--target-model", target_model_path,
+        "--num-samples", "2",
+        "--device", "CPU",
+        "--genai"
+    ])
+    assert result.returncode == 0
+    assert "Metrics for model" in result.stderr
+    assert "## Reference text" not in result.stderr
5 changes: 3 additions & 2 deletions llm_bench/python/who_what_benchmark/whowhatbench/evaluator.py
@@ -96,7 +96,8 @@ def __init__(
max_new_tokens=128,
crop_question=True,
num_samples=None,
-language=None
+language=None,
+gen_answer_fn=None,
) -> None:
assert (
base_model is not None or gt_data is not None
@@ -116,7 +117,7 @@ def __init__(
self.language = autodetect_language(base_model)

if base_model:
-self.gt_data = self._generate_data(base_model)
+self.gt_data = self._generate_data(base_model, gen_answer_fn)
else:
self.gt_data = pd.read_csv(gt_data, keep_default_na=False)

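
`_generate_data` is outside this hunk; presumably it falls back to default HF-style generation when `gen_answer_fn` is `None` and otherwise delegates to the callback. A sketch under that assumption (function, attribute, and column names are guesses based on the tests):

```python
import pandas as pd

def generate_data(model, tokenizer, questions, max_new_tokens=128,
                  crop_question=True, gen_answer_fn=None):
    answers = []
    for q in questions:
        if gen_answer_fn is not None:
            # Custom backend, e.g. genai_gen_answer from wwb.py
            answers.append(gen_answer_fn(model, tokenizer, q,
                                         max_new_tokens, crop_question))
        else:
            # Default HF/Optimum path
            inputs = tokenizer(q, return_tensors="pt")
            out = model.generate(**inputs, max_new_tokens=max_new_tokens)
            answers.append(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
    return pd.DataFrame({"questions": questions, "answers": answers})
```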
78 changes: 65 additions & 13 deletions llm_bench/python/who_what_benchmark/whowhatbench/wwb.py
@@ -1,25 +1,61 @@
import argparse
import difflib
import os

import json
import pandas as pd
+import logging
from datasets import load_dataset
from optimum.exporters import TasksManager
from optimum.intel.openvino import OVModelForCausalLM
from optimum.utils import NormalizedConfigManager, NormalizedTextConfig
-from transformers import AutoConfig, AutoTokenizer
+from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM

from . import Evaluator

+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)

TasksManager._SUPPORTED_MODEL_TYPE["stablelm-epoch"] = TasksManager._SUPPORTED_MODEL_TYPE["llama"]
NormalizedConfigManager._conf["stablelm-epoch"] = NormalizedTextConfig.with_args(
num_layers="num_hidden_layers",
num_attention_heads="num_attention_heads",
)


def load_model(model_id, device="CPU", ov_config=None):
class GenAIModelWrapper():
"""
A helper class to store additional attributes for GenAI models
"""
def __init__(self, model, model_dir):
self.model = model
self.config = AutoConfig.from_pretrained(model_dir)

def __getattr__(self, attr):
if attr in self.__dict__:
return getattr(self, attr)
else:
return getattr(self.model, attr)


def load_genai_pipeline(model_dir, device="CPU"):
try:
import openvino_genai
except ImportError:
logger.error("Failed to import openvino_genai package. Please install it.")
exit(-1)
logger.info("Using OpenVINO GenAI API")
return GenAIModelWrapper(openvino_genai.LLMPipeline(model_dir, device), model_dir)


def load_model(model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False):
if use_hf:
logger.info("Using HF Transformers API")
return AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, device_map=device.lower())

if use_genai:
return load_genai_pipeline(model_id, device)

if ov_config:
with open(ov_config) as f:
ov_options = json.load(f)
@@ -157,6 +193,16 @@ def parse_args():
default=None,
help="Used to select default prompts based on the primary model language, e.g. 'en', 'ch'.",
)
+    parser.add_argument(
+        "--hf",
+        action="store_true",
+        help="Use AutoModelForCausalLM from transformers library to instantiate the model.",
+    )
+    parser.add_argument(
+        "--genai",
+        action="store_true",
+        help="Use LLMPipeline from openvino_genai library to instantiate the model.",
+    )

return parser.parse_args()

@@ -211,6 +257,11 @@ def diff_strings(a: str, b: str, *, use_loguru_colors: bool = False) -> str:
return "".join(output)


+def genai_gen_answer(model, tokenizer, question, max_new_tokens, skip_question):
+    out = model.generate(question, max_new_tokens=max_new_tokens)
+    return out


def main():
args = parse_args()
check_args(args)
Expand All @@ -228,24 +279,25 @@ def main():
language=args.language,
)
else:
-base_model = load_model(args.base_model, args.device, args.ov_config)
+base_model = load_model(args.base_model, args.device, args.ov_config, args.hf, args.genai)
evaluator = Evaluator(
base_model=base_model,
test_data=prompts,
tokenizer=tokenizer,
similarity_model_id=args.text_encoder,
num_samples=args.num_samples,
language=args.language,
+gen_answer_fn=genai_gen_answer if args.genai else None
)
if args.gt_data:
evaluator.dump_gt(args.gt_data)
del base_model

if args.target_model:
-target_model = load_model(args.target_model, args.device, args.ov_config)
-all_metrics_per_question, all_metrics = evaluator.score(target_model)
-print("Metrics for model: ", args.target_model)
-print(all_metrics)
+target_model = load_model(args.target_model, args.device, args.ov_config, args.hf, args.genai)
+all_metrics_per_question, all_metrics = evaluator.score(target_model, genai_gen_answer if args.genai else None)
+logger.info("Metrics for model: %s", args.target_model)
+logger.info(all_metrics)

if args.output:
if not os.path.exists(args.output):
@@ -269,11 +321,11 @@ def main():
actual_text += l2 + "\n"
diff += diff_strings(l1, l2) + "\n"

print("--------------------------------------------------------------------------------------")
print("## Reference text {}:\n".format(i + 1), ref_text)
print("## Actual text {}:\n".format(i + 1), actual_text)
print("## Diff {}: ".format(i + 1))
print(diff)
logger.info("--------------------------------------------------------------------------------------")
logger.info("## Reference text %d:\n%s", i + 1, ref_text)
logger.info("## Actual text %d:\n%s", i + 1, actual_text)
logger.info("## Diff %d: ", i + 1)
logger.info(diff)


if __name__ == "__main__":
