add Readme for tests (#664)
- Added README for Python tests
- Added `--model_ids` option to run tests only on specific models

---------

Co-authored-by: Zlobin Vladimir <[email protected]>
pavel-esir and Wovchena authored Jul 23, 2024
1 parent cb0da0a commit bc92248
Showing 5 changed files with 93 additions and 2 deletions.
47 changes: 47 additions & 0 deletions tests/python_tests/README.md
@@ -0,0 +1,47 @@
# OpenVINO™ GenAI Tests

These tests aim to validate support for the vanilla and continuous batching GenAI APIs.

## Set up the environment

Before running the tests, build or install the OpenVINO GenAI library by following the instructions in the [GenAI Library README](../../src/README.md).

Then install the test requirements:
```sh
pip install -r tests/python_tests/requirements.txt
```

## Run Tests

```sh
python -m pytest tests/python_tests/ -m precommit
```

During the tests, downloaded Hugging Face (HF) models are saved in the current directory. If you wish to place them somewhere else, set the `GENAI_MODELS_PATH_PREFIX` environment variable, e.g.
```sh
GENAI_MODELS_PATH_PREFIX=$HOME/test_models python -m pytest tests/python_tests/ -m precommit
```
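
For reference, the prefix is consumed when model paths are resolved in `ov_genai_test_utils.py` (see the hunk for that file further down). Below is a minimal sketch of that path resolution; the helper name `resolve_model_dir` is introduced here only for illustration:
```python
import os
import pathlib

def resolve_model_dir(model_id: str) -> pathlib.Path:
    # The prefix defaults to the current directory when
    # GENAI_MODELS_PATH_PREFIX is not set.
    prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', ''))
    # 'TinyLlama/TinyLlama-1.1B-Chat-v1.0' -> <prefix>/TinyLlama-1.1B-Chat-v1.0
    return prefix / model_id.split('/')[1]
```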

If you built the GenAI library yourself instead of installing the wheel, set `PYTHONPATH` so that the tests can find the library, e.g.
```sh
PYTHONPATH=$PYTHONPATH:.../openvino.genai/build-Release/ python -m pytest tests/python_tests/ -m precommit
```

## Customise the test run

Tests use `precommit` and `nightly` sets of models. The `precommit` set contains lightweight models that can be inferred quickly, while the `nightly` models are heavier and require more time for inference. If you wish to run only specific tests, use the `-k` option; for example, to run only the multibatch and chat tests on the nightly models:
```sh
python -m pytest tests/python_tests/ -m nightly -k "test_multibatch and test_chat"
```

If you wish to run all tests except beam search, do the following:
```sh
python -m pytest tests/python_tests/ -m precommit -k "not test_beam_search"
```
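
The `precommit`/`nightly` split itself is driven by the `-m` marker: the `pytest_configure` hook added to `conftest.py` in this commit (shown in the diff below) records which marker is in use, roughly as in this sketch:
```python
import pytest

def pytest_configure(config: pytest.Config):
    # Any invocation other than '-m precommit' falls back to the
    # heavier nightly model set.
    marker = 'precommit' if config.getoption('-m') == 'precommit' else 'nightly'
    pytest.run_marker = marker
```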

The `--model_ids` argument can be used to run tests only for specific models. HF model IDs should be separated by spaces, e.g.:
```sh
python -m pytest tests/python_tests/ -m nightly -k "test_multibatch" --model_ids "TinyLlama/TinyLlama-1.1B-Chat-v1.0 Qwen/Qwen2-0.5B-Instruct"
```
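
Internally, `get_models_list()` in `ov_genai_test_utils.py` filters its model list against this option (see its hunk below). A simplified sketch of that filtering, using a hypothetical standalone helper rather than the actual test-utility code:
```python
def filter_model_ids(model_ids, selected_model_ids=None):
    # selected_model_ids mirrors the --model_ids option: a single string of
    # space-separated HF model IDs, or None to keep every model.
    if not selected_model_ids:
        return model_ids
    selected = selected_model_ids.split(' ')
    return [model_id for model_id in model_ids if model_id in selected]

# filter_model_ids(['TinyLlama/TinyLlama-1.1B-Chat-v1.0', 'Qwen/Qwen2-0.5B-Instruct'],
#                  'Qwen/Qwen2-0.5B-Instruct')
# -> ['Qwen/Qwen2-0.5B-Instruct']
```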

The list of currently supported `nightly` and `precommit` models can be found in `tests/python_tests/ov_genai_test_utils.py:get_models_list`.
7 changes: 6 additions & 1 deletion tests/python_tests/conftest.py
@@ -14,6 +14,11 @@ def pytest_make_parametrize_id(config, val, argname):
return f'{argname}={val}'
return None

def pytest_configure(config):
def pytest_addoption(parser):
parser.addoption("--model_ids", help="Select models to run")

def pytest_configure(config: pytest.Config):
marker = 'precommit' if config.getoption('-m') == 'precommit' else 'nightly'
pytest.run_marker = marker
pytest.selected_model_ids = config.getoption('--model_ids', default=None)

5 changes: 4 additions & 1 deletion tests/python_tests/ov_genai_test_utils.py
@@ -49,7 +49,10 @@ def get_models_list():
model_ids = precommit_models
else:
model_ids = nightly_models


if pytest.selected_model_ids:
model_ids = [model_id for model_id in model_ids if model_id in pytest.selected_model_ids.split(' ')]
# pytest.set_trace()
prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', ''))
return [(model_id, prefix / model_id.split('/')[1]) for model_id in model_ids]

4 changes: 4 additions & 0 deletions tests/python_tests/test_chat_generate_api.py
@@ -33,6 +33,7 @@
@pytest.mark.parametrize("generation_config", configs)
@pytest.mark.parametrize("model_descr", get_chat_models_list())
@pytest.mark.precommit
@pytest.mark.nightly
def test_chat_compare_with_HF(model_descr, generation_config: Dict):
device = 'CPU'
chat_history_hf = []
@@ -69,6 +70,7 @@ def test_chat_compare_with_HF(model_descr, generation_config: Dict):
@pytest.mark.parametrize("generation_config", configs)
@pytest.mark.parametrize("model_descr", get_chat_models_list())
@pytest.mark.precommit
@pytest.mark.nightly
def test_chat_compare_text_history_with_HF(model_descr, generation_config: Dict):
# compares with HF when history in ov_genai is save as a text
device = 'CPU'
@@ -104,6 +106,7 @@ def test_chat_compare_text_history_with_HF(model_descr, generation_config: Dict)
@pytest.mark.parametrize("generation_config", configs)
@pytest.mark.parametrize("model_descr", get_chat_models_list())
@pytest.mark.precommit
@pytest.mark.nightly
def test_chat_compare_statefull_vs_text_history(model_descr, generation_config: Dict):
# Check that when history is stored in KV cache results are the same as when history stored in a text.
device ='CPU'
@@ -144,6 +147,7 @@ def test_chat_compare_statefull_vs_text_history(model_descr, generation_config:
{'role': 'user', 'content': 'What was my first question?'},
]
@pytest.mark.precommit
@pytest.mark.nightly
@pytest.mark.parametrize('chat_config', get_chat_templates())
def test_apply_chat_template(model_tmp_path, chat_config: Tuple[str, Dict]):
tokenizer_config = chat_config[1]
32 changes: 32 additions & 0 deletions tests/python_tests/test_generate_api.py
@@ -151,6 +151,7 @@ def hf_ov_genai_tensors_comparison(
@pytest.mark.parametrize("generation_config,prompt", test_cases)
@pytest.mark.parametrize("model_descr", get_models_list())
@pytest.mark.precommit
@pytest.mark.nightly
def test_decoding(model_descr, generation_config, prompt):
run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt)

@@ -168,6 +169,7 @@ def test_decoding(model_descr, generation_config, prompt):
condition=sys.platform == "linux"
)
@pytest.mark.precommit
@pytest.mark.nightly
def test_ov_tensors(model_descr, inputs):
hf_ov_genai_tensors_comparison(read_model(model_descr), dict(max_new_tokens=20), *inputs)

@@ -182,6 +184,7 @@ def test_ov_tensors(model_descr, inputs):
@pytest.mark.parametrize("model_descr", get_models_list())
@pytest.mark.parametrize("prompt", prompts)
@pytest.mark.precommit
@pytest.mark.nightly
@pytest.mark.xfail(
raises=TypeError,
reason="pybind was unable to find ov::Tensor from openvino yet",
@@ -217,6 +220,7 @@ def test_genai_tokenizer_encode(model_descr, prompt):
@pytest.mark.parametrize("model_descr", get_models_list())
@pytest.mark.parametrize("encoded_prompt", encoded_prompts)
@pytest.mark.precommit
@pytest.mark.nightly
@pytest.mark.xfail(
raises=TypeError,
reason="pybind was unable to find ov::Tensor from openvino yet",
@@ -252,6 +256,7 @@ def test_genai_tokenizer_decode(model_descr, encoded_prompt):
@pytest.mark.parametrize("prompts", batched_prompts)
@pytest.mark.parametrize("model_descr", get_models_list())
@pytest.mark.precommit
@pytest.mark.nightly
def test_multibatch(model_descr, generation_config, prompts):
run_hf_ov_genai_comparison_batched(read_model(model_descr), generation_config, prompts)

@@ -264,6 +269,7 @@ def test_multibatch(model_descr, generation_config, prompts):
@pytest.mark.parametrize("prompt", prompts)
@pytest.mark.parametrize("model_descr", get_models_list())
@pytest.mark.precommit
@pytest.mark.nightly
def test_beam_search_decoding(model_descr, num_beam_groups, group_size,
max_new_tokens, diversity_penalty, prompt):
generation_config = dict(
@@ -281,6 +287,7 @@ def test_beam_search_decoding(model_descr, num_beam_groups, group_size,
@pytest.mark.parametrize("max_new_tokens", [10, 80])
@pytest.mark.parametrize("model_descr", get_models_list())
@pytest.mark.precommit
@pytest.mark.nightly
def test_stop_criteria(model_descr, stop_criteria, prompt, max_new_tokens):
# todo: with EARLY stop_criteria looks like HF return unvalid out with sentence<eos><unk><unk>
# while genai ends sentence with <eos>
@@ -323,6 +330,7 @@ def user_defined_callback(subword):

@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
@pytest.mark.precommit
@pytest.mark.nightly
def test_callback_one_string(callback):
pipe = read_model(get_models_list()[0])[4]
generation_config = pipe.get_generation_config()
@@ -332,6 +340,7 @@ def test_callback_one_string(callback):

@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
@pytest.mark.precommit
@pytest.mark.nightly
def test_callback_batch_fail(callback):
pipe = read_model(get_models_list()[0])[4]
with pytest.raises(RuntimeError):
@@ -340,12 +349,14 @@ def test_callback_batch_fail(callback):

@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
@pytest.mark.precommit
@pytest.mark.nightly
def test_callback_kwargs_one_string(callback):
pipe = read_model(get_models_list()[0])[4]
pipe.generate('table is made of', max_new_tokens=10, streamer=callback)

@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
@pytest.mark.precommit
@pytest.mark.nightly
@pytest.mark.parametrize("model_descr", get_models_list())
def test_callback_decoding_metallama(model_descr, callback):
# On metallam this prompt generates output which can shorten after adding new tokens.
@@ -359,6 +370,7 @@ def test_callback_decoding_metallama(model_descr, callback):

@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
@pytest.mark.precommit
@pytest.mark.nightly
def test_callback_kwargs_batch_fail(callback):
pipe = read_model(get_models_list()[0])[4]
with pytest.raises(RuntimeError):
@@ -380,6 +392,7 @@ def end(self):


@pytest.mark.precommit
@pytest.mark.nightly
def test_streamer_one_string():
pipe = read_model(get_models_list()[0])[4]
generation_config = pipe.get_generation_config()
@@ -389,6 +402,7 @@ def test_streamer_one_string():


@pytest.mark.precommit
@pytest.mark.nightly
def test_streamer_batch_fail():
pipe = read_model(get_models_list()[0])[4]
printer = Printer(pipe.get_tokenizer())
@@ -397,13 +411,15 @@ def test_streamer_batch_fail():


@pytest.mark.precommit
@pytest.mark.nightly
def test_streamer_kwargs_one_string():
pipe = read_model(get_models_list()[0])[4]
printer = Printer(pipe.get_tokenizer())
pipe.generate('table is made of', max_new_tokens=10, do_sample=False, streamer=printer)


@pytest.mark.precommit
@pytest.mark.nightly
def test_streamer_kwargs_batch_fail():
pipe = read_model(get_models_list()[0])[4]
printer = Printer(pipe.get_tokenizer())
@@ -412,6 +428,7 @@ def test_streamer_kwargs_batch_fail():


@pytest.mark.precommit
@pytest.mark.nightly
@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
def test_operator_with_callback_one_string(callback):
pipe = read_model(get_models_list()[0])[4]
@@ -421,6 +438,7 @@ def test_operator_with_callback_one_string(callback):


@pytest.mark.precommit
@pytest.mark.nightly
@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
def test_operator_with_callback_batch_fail(callback):
pipe = read_model(get_models_list()[0])[4]
@@ -429,13 +447,15 @@ def test_operator_with_callback_batch_fail(callback):


@pytest.mark.precommit
@pytest.mark.nightly
def test_operator_with_streamer_kwargs_one_string():
pipe = read_model(get_models_list()[0])[4]
printer = Printer(pipe.get_tokenizer())
pipe('hi', max_new_tokens=10, do_sample=True, streamer=printer)


@pytest.mark.precommit
@pytest.mark.nightly
def test_operator_with_streamer_kwargs_batch_fail():
pipe = read_model(get_models_list()[0])[4]
printer = Printer(pipe.get_tokenizer())
@@ -444,6 +464,7 @@ def test_operator_with_streamer_kwargs_batch_fail():


@pytest.mark.precommit
@pytest.mark.nightly
def test_load_special_tokens_ids_1(model_tmp_path):
# test when there is an available config.json
config_json = {
@@ -458,6 +479,7 @@ def test_load_special_tokens_ids_1(model_tmp_path):


@pytest.mark.precommit
@pytest.mark.nightly
def test_load_special_tokens_str_2(model_tmp_path):
# test with special_tokens_map
special_tokens_map_json = {
@@ -472,6 +494,7 @@ def test_load_special_tokens_str_2(model_tmp_path):


@pytest.mark.precommit
@pytest.mark.nightly
def test_load_special_tokens_3_(model_tmp_path):
# special_tokens_map is not available
# but tokenize_config.json exists
@@ -498,6 +521,7 @@ def test_load_special_tokens_3_(model_tmp_path):


@pytest.mark.precommit
@pytest.mark.nightly
def test_load_special_tokens_3(model_tmp_path):
# both config.json is availabel and tokenizer_config.json available
# check that it does not read int values from tokenizer_config.json if they are in config.json
@@ -532,6 +556,7 @@ def test_load_special_tokens_3(model_tmp_path):


@pytest.mark.precommit
@pytest.mark.nightly
@pytest.mark.xfail(
raises=AssertionError,
reason="CVS-143410 ov tokenizer should be aligned with hf",
@@ -575,6 +600,7 @@ def test_load_special_tokens_4(model_tmp_path):
]
@pytest.mark.parametrize("generation_config", invalid_configs)
@pytest.mark.precommit
@pytest.mark.nightly
def test_invalid_configs(model_tmp_path, generation_config):
model_id, temp_path = model_tmp_path
config_json = {}
@@ -584,6 +610,7 @@ def test_invalid_configs(model_tmp_path, generation_config):


@pytest.mark.precommit
@pytest.mark.nightly
def test_valid_configs(model_tmp_path):
model_id, temp_path = model_tmp_path
pipe = load_pipe([({"eos_token_id": 37}, "config.json")], temp_path)
@@ -602,6 +629,7 @@ def test_valid_configs(model_tmp_path):
dict(top_k=0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_k
]
@pytest.mark.precommit
@pytest.mark.nightly
@pytest.mark.parametrize("generation_config", invalid_py_configs)
def test_python_generation_config_validation(model_tmp_path, generation_config):
model_id, temp_path = model_tmp_path
@@ -615,6 +643,7 @@ def test_python_generation_config_validation(model_tmp_path, generation_config):


@pytest.mark.precommit
@pytest.mark.nightly
def test_unicode_pybind_decoding_1():
# On this model this prompt generates unfinished utf string.
# Test that pybind will not fail.
@@ -626,6 +655,7 @@ def test_unicode_pybind_decoding_1():


@pytest.mark.precommit
@pytest.mark.nightly
def test_unicode_pybind_decoding_2():
# On this model this prompt generates unfinished utf string.
# Test that pybind will not fail.
@@ -636,6 +666,7 @@ def test_unicode_pybind_decoding_2():


@pytest.mark.precommit
@pytest.mark.nightly
def test_unicode_pybind_decoding_3():
# On this model this prompt generates unfinished utf-8 string
# and streams it. Test that pybind will not fail while we pass string to python.
@@ -648,6 +679,7 @@ def test_unicode_pybind_decoding_3():

@pytest.mark.skip(reason="probably both models ov + hf doesn't fit to memory")
@pytest.mark.precommit
@pytest.mark.nightly
@pytest.mark.skipif(sys.platform.startswith("win"), reason="not enough space for this model on Win")
def test_left_pad():
# test left pad tokenizer post processing implementation
