add Readme for tests #664

Merged
Changes from 3 commits
47 changes: 47 additions & 0 deletions tests/python_tests/README.md
@@ -0,0 +1,47 @@
# OpenVINO™ GenAI Tests

These tests aim to validate support for the vanilla and continuous batching GenAI APIs.

## Setup environment

To run the tests, first build or install the OpenVINO GenAI library following the instructions in the [GenAI Library README](../../src/README.md).

Then install the test requirements:
```sh
pip install -r tests/python_tests/requirements.txt
```

## Run Tests

```sh
python -m pytest tests/python_tests/ -m precommit
```

During the tests, downloaded HuggingFace (HF) models are saved in the current directory. If you wish to place them somewhere else, set the `GENAI_MODELS_PATH_PREFIX` environment variable, e.g.:
```sh
GENAI_MODELS_PATH_PREFIX=$HOME/test_models python -m pytest tests/python_tests/ -m precommit
```

If you have built the GenAI library yourself instead of using the wheel, set `PYTHONPATH` so that the tests can find the library, e.g.:
```sh
PYTHONPATH=$PYTHONPATH:.../openvino.genai/build-Release/ python -m pytest tests/python_tests/ -m precommit
```

## Customise test runs

Tests use `precommit` and `nightly` sets of models. The `precommit` set contains lightweight models that can be inferred quickly, while the `nightly` models are heavier and require more time for inference. If you wish to run only specific tests, use the `-k` option; for example, to run only the multibatch and chat tests for the nightly models:
```sh
python -m pytest tests/python_tests/ -m nightly -k "test_multibatch and test_chat"
```

If you wish to run all tests except beam search, do the following:
```sh
python -m pytest tests/python_tests/ -m precommit -k "not test_beam_search"
```

The `--model_ids` argument can be used to run tests only for specific models. HF model ids should be separated by spaces, e.g.:
```sh
python -m pytest ~/devel/openvino.genai/tests/python_tests/ -m nightly -k "test_multibatch" --model_ids "TinyLlama/TinyLlama-1.1B-Chat-v1.0 Qwen/Qwen2-0.5B-Instruct"
```

The list of currently supported `nightly` and `precommit` models can be found in `tests/python_tests/ov_genai_test_utils.py:get_models_list`.
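
For reference, here is a minimal sketch of how this model selection works, pieced together from the `conftest.py` and `ov_genai_test_utils.py` changes in this PR; the model ids below are illustrative placeholders rather than the full supported lists:

```python
# Minimal sketch (assumes pytest.run_marker and pytest.selected_model_ids are
# set by conftest.py's pytest_configure hook, as in the diff further down).
import os
import pathlib
import pytest

precommit_models = ["TinyLlama/TinyLlama-1.1B-Chat-v1.0"]  # illustrative subset
nightly_models = ["Qwen/Qwen2-0.5B-Instruct"]              # illustrative subset

def get_models_list():
    # choose the model set matching the pytest marker (-m precommit / -m nightly)
    model_ids = precommit_models if pytest.run_marker == 'precommit' else nightly_models
    # --model_ids is a space-separated list of HF ids; keep only the requested ones
    if pytest.selected_model_ids:
        model_ids = [m for m in model_ids if m in pytest.selected_model_ids.split(' ')]
    # downloaded models are cached under GENAI_MODELS_PATH_PREFIX (default: current directory)
    prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', ''))
    return [(m, prefix / m.split('/')[1]) for m in model_ids]
```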
7 changes: 6 additions & 1 deletion tests/python_tests/conftest.py
@@ -14,6 +14,11 @@ def pytest_make_parametrize_id(config, val, argname):
return f'{argname}={val}'
return None

def pytest_configure(config):
def pytest_addoption(parser):
parser.addoption("--model_ids", help="Select models to run")

def pytest_configure(config: pytest.Config):
marker = 'precommit' if config.getoption('-m') == 'precommit' else 'nightly'
pytest.run_marker = marker
pytest.selected_model_ids = config.getoption('--model_ids', default=None)
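With these hooks in place, the marker and model filter can be combined on the command line, in the same form as the README examples above, e.g.:
```sh
python -m pytest tests/python_tests/ -m nightly --model_ids "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
```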

5 changes: 4 additions & 1 deletion tests/python_tests/ov_genai_test_utils.py
@@ -49,7 +49,10 @@ def get_models_list():
model_ids = precommit_models
else:
model_ids = nightly_models


if pytest.selected_model_ids:
model_ids = [model_id for model_id in model_ids if model_id in pytest.selected_model_ids.split(' ')]
# pytest.set_trace()
prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', ''))
return [(model_id, prefix / model_id.split('/')[1]) for model_id in model_ids]

4 changes: 4 additions & 0 deletions tests/python_tests/test_chat_generate_api.py
@@ -33,6 +33,7 @@
@pytest.mark.parametrize("generation_config", configs)
@pytest.mark.parametrize("model_descr", get_chat_models_list())
@pytest.mark.precommit
@pytest.mark.nightly
def test_chat_compare_with_HF(model_descr, generation_config: Dict):
device = 'CPU'
chat_history_hf = []
@@ -69,6 +70,7 @@ def test_chat_compare_with_HF(model_descr, generation_config: Dict):
@pytest.mark.parametrize("generation_config", configs)
@pytest.mark.parametrize("model_descr", get_chat_models_list())
@pytest.mark.precommit
@pytest.mark.nightly
def test_chat_compare_text_history_with_HF(model_descr, generation_config: Dict):
# compares with HF when history in ov_genai is save as a text
device = 'CPU'
@@ -104,6 +106,7 @@ def test_chat_compare_text_history_with_HF(model_descr, generation_config: Dict)
@pytest.mark.parametrize("generation_config", configs)
@pytest.mark.parametrize("model_descr", get_chat_models_list())
@pytest.mark.precommit
@pytest.mark.nightly
def test_chat_compare_statefull_vs_text_history(model_descr, generation_config: Dict):
# Check that when history is stored in KV cache results are the same as when history stored in a text.
device ='CPU'
@@ -144,6 +147,7 @@ def test_chat_compare_statefull_vs_text_history(model_descr, generation_config:
{'role': 'user', 'content': 'What was my first question?'},
]
@pytest.mark.precommit
@pytest.mark.nightly
@pytest.mark.parametrize('chat_config', get_chat_templates())
def test_apply_chat_template(model_tmp_path, chat_config: Tuple[str, Dict]):
tokenizer_config = chat_config[1]
32 changes: 32 additions & 0 deletions tests/python_tests/test_generate_api.py
@@ -151,6 +151,7 @@ def hf_ov_genai_tensors_comparison(
@pytest.mark.parametrize("generation_config,prompt", test_cases)
@pytest.mark.parametrize("model_descr", get_models_list())
@pytest.mark.precommit
@pytest.mark.nightly
def test_decoding(model_descr, generation_config, prompt):
run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt)

@@ -168,6 +169,7 @@ def test_decoding(model_descr, generation_config, prompt):
condition=sys.platform == "linux"
)
@pytest.mark.precommit
@pytest.mark.nightly
def test_ov_tensors(model_descr, inputs):
hf_ov_genai_tensors_comparison(read_model(model_descr), dict(max_new_tokens=20), *inputs)

@@ -182,6 +184,7 @@ def test_ov_tensors(model_descr, inputs):
@pytest.mark.parametrize("model_descr", get_models_list())
@pytest.mark.parametrize("prompt", prompts)
@pytest.mark.precommit
@pytest.mark.nightly
@pytest.mark.xfail(
raises=TypeError,
reason="pybind was unable to find ov::Tensor from openvino yet",
@@ -217,6 +220,7 @@ def test_genai_tokenizer_encode(model_descr, prompt):
@pytest.mark.parametrize("model_descr", get_models_list())
@pytest.mark.parametrize("encoded_prompt", encoded_prompts)
@pytest.mark.precommit
@pytest.mark.nightly
@pytest.mark.xfail(
raises=TypeError,
reason="pybind was unable to find ov::Tensor from openvino yet",
@@ -252,6 +256,7 @@ def test_genai_tokenizer_decode(model_descr, encoded_prompt):
@pytest.mark.parametrize("prompts", batched_prompts)
@pytest.mark.parametrize("model_descr", get_models_list())
@pytest.mark.precommit
@pytest.mark.nightly
def test_multibatch(model_descr, generation_config, prompts):
run_hf_ov_genai_comparison_batched(read_model(model_descr), generation_config, prompts)

@@ -264,6 +269,7 @@ def test_multibatch(model_descr, generation_config, prompts):
@pytest.mark.parametrize("prompt", prompts)
@pytest.mark.parametrize("model_descr", get_models_list())
@pytest.mark.precommit
@pytest.mark.nightly
def test_beam_search_decoding(model_descr, num_beam_groups, group_size,
max_new_tokens, diversity_penalty, prompt):
generation_config = dict(
@@ -281,6 +287,7 @@ def test_beam_search_decoding(model_descr, num_beam_groups, group_size,
@pytest.mark.parametrize("max_new_tokens", [10, 80])
@pytest.mark.parametrize("model_descr", get_models_list())
@pytest.mark.precommit
@pytest.mark.nightly
def test_stop_criteria(model_descr, stop_criteria, prompt, max_new_tokens):
# todo: with EARLY stop_criteria looks like HF return unvalid out with sentence<eos><unk><unk>
# while genai ends sentence with <eos>
@@ -323,6 +330,7 @@ def user_defined_callback(subword):

@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
@pytest.mark.precommit
@pytest.mark.nightly
def test_callback_one_string(callback):
pipe = read_model(get_models_list()[0])[4]
generation_config = pipe.get_generation_config()
@@ -332,6 +340,7 @@ def test_callback_one_string(callback):

@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
@pytest.mark.precommit
@pytest.mark.nightly
def test_callback_batch_fail(callback):
pipe = read_model(get_models_list()[0])[4]
with pytest.raises(RuntimeError):
@@ -340,12 +349,14 @@ def test_callback_batch_fail(callback):

@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
@pytest.mark.precommit
@pytest.mark.nightly
def test_callback_kwargs_one_string(callback):
pipe = read_model(get_models_list()[0])[4]
pipe.generate('table is made of', max_new_tokens=10, streamer=callback)

@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
@pytest.mark.precommit
@pytest.mark.nightly
@pytest.mark.parametrize("model_descr", get_models_list())
def test_callback_decoding_metallama(model_descr, callback):
# On metallam this prompt generates output which can shorten after adding new tokens.
@@ -359,6 +370,7 @@ def test_callback_decoding_metallama(model_descr, callback):

@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
@pytest.mark.precommit
@pytest.mark.nightly
def test_callback_kwargs_batch_fail(callback):
pipe = read_model(get_models_list()[0])[4]
with pytest.raises(RuntimeError):
@@ -380,6 +392,7 @@ def end(self):


@pytest.mark.precommit
@pytest.mark.nightly
def test_streamer_one_string():
pipe = read_model(get_models_list()[0])[4]
generation_config = pipe.get_generation_config()
@@ -389,6 +402,7 @@ def test_streamer_one_string():


@pytest.mark.precommit
@pytest.mark.nightly
def test_streamer_batch_fail():
pipe = read_model(get_models_list()[0])[4]
printer = Printer(pipe.get_tokenizer())
@@ -397,13 +411,15 @@ def test_streamer_batch_fail():


@pytest.mark.precommit
@pytest.mark.nightly
def test_streamer_kwargs_one_string():
pipe = read_model(get_models_list()[0])[4]
printer = Printer(pipe.get_tokenizer())
pipe.generate('table is made of', max_new_tokens=10, do_sample=False, streamer=printer)


@pytest.mark.precommit
@pytest.mark.nightly
def test_streamer_kwargs_batch_fail():
pipe = read_model(get_models_list()[0])[4]
printer = Printer(pipe.get_tokenizer())
@@ -412,6 +428,7 @@ def test_streamer_kwargs_batch_fail():


@pytest.mark.precommit
@pytest.mark.nightly
@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
def test_operator_with_callback_one_string(callback):
pipe = read_model(get_models_list()[0])[4]
@@ -421,6 +438,7 @@ def test_operator_with_callback_one_string(callback):


@pytest.mark.precommit
@pytest.mark.nightly
@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
def test_operator_with_callback_batch_fail(callback):
pipe = read_model(get_models_list()[0])[4]
@@ -429,13 +447,15 @@ def test_operator_with_callback_batch_fail(callback):


@pytest.mark.precommit
@pytest.mark.nightly
def test_operator_with_streamer_kwargs_one_string():
pipe = read_model(get_models_list()[0])[4]
printer = Printer(pipe.get_tokenizer())
pipe('hi', max_new_tokens=10, do_sample=True, streamer=printer)


@pytest.mark.precommit
@pytest.mark.nightly
def test_operator_with_streamer_kwargs_batch_fail():
pipe = read_model(get_models_list()[0])[4]
printer = Printer(pipe.get_tokenizer())
@@ -444,6 +464,7 @@ def test_operator_with_streamer_kwargs_batch_fail():


@pytest.mark.precommit
@pytest.mark.nightly
def test_load_special_tokens_ids_1(model_tmp_path):
# test when there is an available config.json
config_json = {
@@ -458,6 +479,7 @@ def test_load_special_tokens_ids_1(model_tmp_path):


@pytest.mark.precommit
@pytest.mark.nightly
def test_load_special_tokens_str_2(model_tmp_path):
# test with special_tokens_map
special_tokens_map_json = {
@@ -472,6 +494,7 @@ def test_load_special_tokens_str_2(model_tmp_path):


@pytest.mark.precommit
@pytest.mark.nightly
def test_load_special_tokens_3_(model_tmp_path):
# special_tokens_map is not available
# but tokenize_config.json exists
@@ -498,6 +521,7 @@ def test_load_special_tokens_3_(model_tmp_path):


@pytest.mark.precommit
@pytest.mark.nightly
def test_load_special_tokens_3(model_tmp_path):
# both config.json is availabel and tokenizer_config.json available
# check that it does not read int values from tokenizer_config.json if they are in config.json
@@ -532,6 +556,7 @@ def test_load_special_tokens_3(model_tmp_path):


@pytest.mark.precommit
@pytest.mark.nightly
@pytest.mark.xfail(
raises=AssertionError,
reason="CVS-143410 ov tokenizer should be aligned with hf",
@@ -575,6 +600,7 @@ def test_load_special_tokens_4(model_tmp_path):
]
@pytest.mark.parametrize("generation_config", invalid_configs)
@pytest.mark.precommit
@pytest.mark.nightly
def test_invalid_configs(model_tmp_path, generation_config):
model_id, temp_path = model_tmp_path
config_json = {}
@@ -584,6 +610,7 @@ def test_invalid_configs(model_tmp_path, generation_config):


@pytest.mark.precommit
@pytest.mark.nightly
def test_valid_configs(model_tmp_path):
model_id, temp_path = model_tmp_path
pipe = load_pipe([({"eos_token_id": 37}, "config.json")], temp_path)
@@ -602,6 +629,7 @@ def test_valid_configs(model_tmp_path):
dict(top_k=0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_k
]
@pytest.mark.precommit
@pytest.mark.nightly
@pytest.mark.parametrize("generation_config", invalid_py_configs)
def test_python_generation_config_validation(model_tmp_path, generation_config):
model_id, temp_path = model_tmp_path
@@ -615,6 +643,7 @@ def test_python_generation_config_validation(model_tmp_path, generation_config):


@pytest.mark.precommit
@pytest.mark.nightly
def test_unicode_pybind_decoding_1():
# On this model this prompt generates unfinished utf string.
# Test that pybind will not fail.
@@ -626,6 +655,7 @@ def test_unicode_pybind_decoding_1():


@pytest.mark.precommit
@pytest.mark.nightly
def test_unicode_pybind_decoding_2():
# On this model this prompt generates unfinished utf string.
# Test that pybind will not fail.
@@ -636,6 +666,7 @@ def test_unicode_pybind_decoding_2():


@pytest.mark.precommit
@pytest.mark.nightly
def test_unicode_pybind_decoding_3():
# On this model this prompt generates unfinished utf-8 string
# and streams it. Test that pybind will not fail while we pass string to python.
@@ -648,6 +679,7 @@ def test_unicode_pybind_decoding_3():

@pytest.mark.skip(reason="probably both models ov + hf doesn't fit to memory")
@pytest.mark.precommit
@pytest.mark.nightly
@pytest.mark.skipif(sys.platform.startswith("win"), reason="not enough space for this model on Win")
def test_left_pad():
# test left pad tokenizer post processing implementation