Skip to content

Commit

Permalink
[WWB]: Added ability to compare results for previously collected outp…
Browse files Browse the repository at this point in the history
…uts w/o models provided (#1238)

- Compare outputs collected from the previous runs
- Kept only "similarity" metric by default as the only one that is used
in CI

Example:
```shell
optimum-cli export openvino -m Qwen/Qwen2-0.5B-Instruct --weight-format fp16 models/Qwen2-0.5B-Instruct-fp16

mkdir qwen2_N_FP16

# References from NAT FP16
wwb --base-model Qwen/Qwen2-0.5B-Instruct --gt-data qwen2_N_FP16/gt.csv --hf --num-samples 4

# Compare N_O_FP16, save Optimum data for references
wwb --target-model models/Qwen2-0.5B-Instruct-fp16 --gt-data qwen2_N_FP16/gt.csv --output qwen2_N_O_FP16 --num-samples 4

# Compare N_G_FP16, save GenAI data for references
wwb --target-model  models/Qwen2-0.5B-Instruct-fp16 --gt-data qwen2_N_FP16/gt.csv --genai --output qwen2_N_G_FP16 --num-samples 4

# Compare O_G_FP16, use pre-generated ground truth and target data from the previous runs
wwb --target-data qwen2_N_G_FP16/target.csv --gt-data qwen2_N_O_FP16/target.csv --genai --output qwen2_O_G_FP16 --num-samples 4

# The same for INT8
optimum-cli export openvino -m Qwen/Qwen2-0.5B-Instruct --weight-format int8 models/Qwen2-0.5B-Instruct-int8

# Compare N_G_INT8, save GenAI data for references
wwb --target-model models/Qwen2-0.5B-Instruct-int8 --gt-data qwen2_N_FP16/gt.csv --genai --output qwen2_N_G_INT8 --num-samples 4
```
  • Loading branch information
AlexKoff88 authored Nov 25, 2024
1 parent 18e8d5b commit d490c18
Showing 8 changed files with 298 additions and 231 deletions.
166 changes: 90 additions & 76 deletions tools/who_what_benchmark/tests/test_cli_image.py
Original file line number Diff line number Diff line change
@@ -14,7 +14,6 @@ def run_wwb(args):
logger.info(" ".join(["TRANSFOREMRS_VERBOSITY=debug wwb"] + args))
result = subprocess.run(["wwb"] + args, capture_output=True, text=True)
logger.info(result)
print(" ".join(["TRANSFOREMRS_VERBOSITY=debug wwb"] + args))
return result


@@ -27,7 +26,7 @@ def run_wwb(args):
],
)
def test_image_model_types(model_id, model_type, backend):
GT_FILE = "test_sd.json"
GT_FILE = "test_sd.csv"
wwb_args = [
"--base-model",
model_id,
@@ -70,79 +69,94 @@ def test_image_model_types(model_id, model_type, backend):
],
)
def test_image_model_genai(model_id, model_type):
GT_FILE = "test_sd.json"
MODEL_PATH = tempfile.TemporaryDirectory().name

result = subprocess.run(["optimum-cli", "export",
"openvino", "-m", model_id,
MODEL_PATH], capture_output=True, text=True)
assert result.returncode == 0

wwb_args = [
"--base-model",
MODEL_PATH,
"--num-samples",
"1",
"--gt-data",
GT_FILE,
"--device",
"CPU",
"--model-type",
model_type,
]
result = run_wwb(wwb_args)
assert result.returncode == 0
assert os.path.exists(GT_FILE)
assert os.path.exists("reference")

wwb_args = [
"--target-model",
MODEL_PATH,
"--num-samples",
"1",
"--gt-data",
GT_FILE,
"--device",
"CPU",
"--model-type",
model_type,
"--genai",
]
result = run_wwb(wwb_args)

assert result.returncode == 0
assert "Metrics for model" in result.stderr
similarity = float(str(result.stderr).split(" ")[-1])
assert similarity >= 0.98
assert os.path.exists("target")

output_dir = tempfile.TemporaryDirectory().name
wwb_args = [
"--target-model",
MODEL_PATH,
"--num-samples",
"1",
"--gt-data",
GT_FILE,
"--device",
"CPU",
"--model-type",
model_type,
"--output",
output_dir,
]
result = run_wwb(wwb_args)
assert os.path.exists(os.path.join(output_dir, "target"))
assert os.path.exists(os.path.join(output_dir, "target.json"))

try:
os.remove(GT_FILE)
except OSError:
pass
shutil.rmtree("reference", ignore_errors=True)
shutil.rmtree("target", ignore_errors=True)
shutil.rmtree(MODEL_PATH, ignore_errors=True)
shutil.rmtree(output_dir, ignore_errors=True)
with tempfile.TemporaryDirectory() as temp_dir:
GT_FILE = os.path.join(temp_dir, "gt.csv")
MODEL_PATH = os.path.join(temp_dir, model_id.replace("/", "--"))

result = subprocess.run(["optimum-cli", "export",
"openvino", "-m", model_id,
MODEL_PATH],
capture_output=True, text=True)
assert result.returncode == 0

wwb_args = [
"--base-model",
MODEL_PATH,
"--num-samples",
"1",
"--gt-data",
GT_FILE,
"--device",
"CPU",
"--model-type",
model_type,
]
result = run_wwb(wwb_args)
assert result.returncode == 0
assert os.path.exists(GT_FILE)
assert os.path.exists(os.path.join(temp_dir, "reference"))

wwb_args = [
"--target-model",
MODEL_PATH,
"--num-samples",
"1",
"--gt-data",
GT_FILE,
"--device",
"CPU",
"--model-type",
model_type,
"--genai",
]
result = run_wwb(wwb_args)

assert result.returncode == 0
assert "Metrics for model" in result.stderr
similarity = float(str(result.stderr).split(" ")[-1])
assert similarity >= 0.98
assert os.path.exists(os.path.join(temp_dir, "target"))

output_dir = tempfile.TemporaryDirectory().name
wwb_args = [
"--target-model",
MODEL_PATH,
"--num-samples",
"1",
"--gt-data",
GT_FILE,
"--device",
"CPU",
"--model-type",
model_type,
"--output",
output_dir,
]
result = run_wwb(wwb_args)
assert result.returncode == 0
assert os.path.exists(os.path.join(output_dir, "target"))
assert os.path.exists(os.path.join(output_dir, "target.csv"))

# test w/o models
wwb_args = [
"--target-data",
os.path.join(output_dir, "target.csv"),
"--num-samples",
"1",
"--gt-data",
GT_FILE,
"--device",
"CPU",
"--model-type",
model_type,
]
result = run_wwb(wwb_args)
assert result.returncode == 0

shutil.rmtree("reference", ignore_errors=True)
shutil.rmtree("target", ignore_errors=True)
shutil.rmtree(MODEL_PATH, ignore_errors=True)
shutil.rmtree(output_dir, ignore_errors=True)


@pytest.mark.parametrize(
@@ -152,7 +166,7 @@ def test_image_model_genai(model_id, model_type):
],
)
def test_image_custom_dataset(model_id, model_type, backend):
GT_FILE = "test_sd.json"
GT_FILE = "test_sd.csv"
wwb_args = [
"--base-model",
model_id,
128 changes: 71 additions & 57 deletions tools/who_what_benchmark/tests/test_cli_text.py
Original file line number Diff line number Diff line change
@@ -73,29 +73,28 @@ def test_text_target_model():

@pytest.fixture
def test_text_gt_data():
with tempfile.NamedTemporaryFile(suffix=".csv") as tmpfile:
temp_file_name = tmpfile.name
with tempfile.TemporaryDirectory() as temp_dir:
temp_file_name = os.path.join(temp_dir, "gt.csv")

result = run_wwb(
[
"--base-model",
base_model_path,
"--gt-data",
temp_file_name,
"--dataset",
"EleutherAI/lambada_openai,en",
"--dataset-field",
"text",
"--split",
"test",
"--num-samples",
"2",
"--device",
"CPU",
]
)
data = pd.read_csv(temp_file_name)
os.remove(temp_file_name)
result = run_wwb(
[
"--base-model",
base_model_path,
"--gt-data",
temp_file_name,
"--dataset",
"EleutherAI/lambada_openai,en",
"--dataset-field",
"text",
"--split",
"test",
"--num-samples",
"2",
"--device",
"CPU",
]
)
data = pd.read_csv(temp_file_name)

assert result.returncode == 0
assert len(data["questions"].values) == 2
@@ -107,6 +106,8 @@ def test_text_output_directory():
[
"--base-model",
base_model_path,
"--gt-data",
os.path.join(temp_dir, "gt.csv"),
"--target-model",
target_model_path,
"--num-samples",
@@ -121,7 +122,23 @@ def test_text_output_directory():
assert "Metrics for model" in result.stderr
assert os.path.exists(os.path.join(temp_dir, "metrics_per_qustion.csv"))
assert os.path.exists(os.path.join(temp_dir, "metrics.csv"))
assert os.path.exists(os.path.join(temp_dir, "target.json"))
assert os.path.exists(os.path.join(temp_dir, "target.csv"))

# test measurement w/o models
result = run_wwb(
[
"--gt-data",
os.path.join(temp_dir, "gt.csv"),
"--target-data",
os.path.join(temp_dir, "target.csv"),
"--num-samples",
"2",
"--device",
"CPU",
]
)
assert result.returncode == 0
assert "Metrics for model" in result.stderr


def test_text_verbose():
@@ -143,46 +160,43 @@ def test_text_verbose():


def test_text_language_autodetect():
temp_file_name = tempfile.NamedTemporaryFile(suffix=".csv").name

result = run_wwb(
[
"--base-model",
"Qwen/Qwen2-0.5B",
"--gt-data",
temp_file_name,
"--num-samples",
"2",
"--device",
"CPU",
]
)
data = pd.read_csv(temp_file_name)
os.remove(temp_file_name)
with tempfile.TemporaryDirectory() as temp_dir:
temp_file_name = os.path.join(temp_dir, "gt.csv")
result = run_wwb(
[
"--base-model",
"Qwen/Qwen2-0.5B",
"--gt-data",
temp_file_name,
"--num-samples",
"2",
"--device",
"CPU",
]
)
data = pd.read_csv(temp_file_name)

assert result.returncode == 0
assert "马克" in data["prompts"].values[0]


def test_text_hf_model():
with tempfile.NamedTemporaryFile(suffix=".csv") as tmpfile:
temp_file_name = tmpfile.name

result = run_wwb(
[
"--base-model",
model_id,
"--gt-data",
temp_file_name,
"--num-samples",
"2",
"--device",
"CPU",
"--hf",
]
)
data = pd.read_csv(temp_file_name)
os.remove(temp_file_name)
with tempfile.TemporaryDirectory() as temp_dir:
temp_file_name = os.path.join(temp_dir, "gt.csv")
result = run_wwb(
[
"--base-model",
model_id,
"--gt-data",
temp_file_name,
"--num-samples",
"2",
"--device",
"CPU",
"--hf",
]
)
data = pd.read_csv(temp_file_name)

assert result.returncode == 0
assert len(data["prompts"].values) == 2
Loading

0 comments on commit d490c18

Please sign in to comment.