diff --git a/tests/python_tests/README.md b/tests/python_tests/README.md
new file mode 100644
index 0000000000..e5381708de
--- /dev/null
+++ b/tests/python_tests/README.md
@@ -0,0 +1,52 @@
+# OpenVINO™ GenAI Tests
+
+These tests aim to validate support for the vanilla and continuous batching GenAI APIs.
+
+## Setup environment
+
+Before running the tests, build or install the OpenVINO GenAI library by following the instructions in the [GenAI Library README](../../src/README.md).
+
+Then install the test requirements:
+```sh
+pip install -r tests/python_tests/requirements.txt
+```
+
+## Run Tests
+
+```sh
+python -m pytest tests/python_tests/ -m precommit
+```
+
+During a test run, downloaded Hugging Face (HF) models are saved into the current directory. If you wish to place them somewhere else, set the `GENAI_MODELS_PATH_PREFIX` environment variable, e.g.
+```sh
+GENAI_MODELS_PATH_PREFIX=$HOME/test_models python -m pytest tests/python_tests/ -m precommit
+```
+
+If you built the GenAI library yourself instead of installing the wheel, set `PYTHONPATH` so that the tests can find the library, e.g.
+```sh
+PYTHONPATH=$PYTHONPATH:.../openvino.genai/build-Release/ python -m pytest tests/python_tests/ -m precommit
+```
+
+## Customise test runs
+
+Tests have `precommit` and `nightly` sets of models. `precommit` contains lightweight models which can be inferred quickly; `nightly` models are heavier and require more time for inference. To run only specific tests, use the `-k` option, for example to run only the multibatch and chat tests against the nightly models:
+```sh
+python -m pytest tests/python_tests/ -m nightly -k "test_multibatch and test_chat"
+```
+
+To run all tests except beam search:
+```sh
+python -m pytest tests/python_tests/ -m precommit -k "not test_beam_search"
+```
+
+The `--model_ids` argument can be used to run tests only for specific models. HF model ids should be separated by spaces, e.g.:
+```sh
+python -m pytest tests/python_tests/ -m nightly -k "test_multibatch" --model_ids "TinyLlama/TinyLlama-1.1B-Chat-v1.0 Qwen/Qwen2-0.5B-Instruct"
+```
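+
+To preview which tests a given marker and `-k` filter would select without actually running them, you can combine the options above with pytest's standard collect-only mode (a usage sketch; `--collect-only` and `-q` are stock pytest flags, not specific to this test suite):
+```sh
+python -m pytest tests/python_tests/ -m precommit --collect-only -q
+```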
+
+The list of currently supported `nightly` and `precommit` models can be found in `tests/python_tests/ov_genai_test_utils.py:get_models_list`.
diff --git a/tests/python_tests/conftest.py b/tests/python_tests/conftest.py
index 66212468af..f98f47ecf3 100644
--- a/tests/python_tests/conftest.py
+++ b/tests/python_tests/conftest.py
@@ -14,6 +14,11 @@ def pytest_make_parametrize_id(config, val, argname):
         return f'{argname}={val}'
     return None
 
-def pytest_configure(config):
+def pytest_addoption(parser):
+    parser.addoption("--model_ids", help="Select models to run")
+
+def pytest_configure(config: pytest.Config):
     marker = 'precommit' if config.getoption('-m') == 'precommit' else 'nightly'
     pytest.run_marker = marker
+    pytest.selected_model_ids = config.getoption('--model_ids', default=None)
+
diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py
index 4ba71a1d48..bc95418aff 100644
--- a/tests/python_tests/ov_genai_test_utils.py
+++ b/tests/python_tests/ov_genai_test_utils.py
@@ -49,7 +49,10 @@ def get_models_list():
         model_ids = precommit_models
     else:
         model_ids = nightly_models
-
+
+    if pytest.selected_model_ids:
+        model_ids = [model_id for model_id in model_ids if model_id in pytest.selected_model_ids.split(' ')]
+
     # pytest.set_trace()
     prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', ''))
     return [(model_id, prefix / model_id.split('/')[1]) for model_id in model_ids]
diff --git a/tests/python_tests/test_chat_generate_api.py b/tests/python_tests/test_chat_generate_api.py
index 94de8f6cc2..5a73d481d3 100644
--- a/tests/python_tests/test_chat_generate_api.py
+++ b/tests/python_tests/test_chat_generate_api.py
@@ -33,6 +33,7 @@
 @pytest.mark.parametrize("generation_config", configs)
 @pytest.mark.parametrize("model_descr", get_chat_models_list())
 @pytest.mark.precommit
+@pytest.mark.nightly
 def test_chat_compare_with_HF(model_descr, generation_config: Dict):
     device = 'CPU'
     chat_history_hf = []
@@ -69,6 +70,7 @@ def test_chat_compare_with_HF(model_descr, generation_config: Dict):
 @pytest.mark.parametrize("generation_config", configs)
 @pytest.mark.parametrize("model_descr", get_chat_models_list())
 @pytest.mark.precommit
+@pytest.mark.nightly
 def test_chat_compare_text_history_with_HF(model_descr, generation_config: Dict):
     # compares with HF when history in ov_genai is saved as text
     device = 'CPU'
@@ -104,6 +106,7 @@ def test_chat_compare_text_history_with_HF(model_descr, generation_config: Dict)
 @pytest.mark.parametrize("generation_config", configs)
 @pytest.mark.parametrize("model_descr", get_chat_models_list())
 @pytest.mark.precommit
+@pytest.mark.nightly
 def test_chat_compare_statefull_vs_text_history(model_descr, generation_config: Dict):
     # Check that when history is stored in the KV cache, results are the same as when history is stored as text.
     device = 'CPU'
@@ -144,6 +147,7 @@ def test_chat_compare_statefull_vs_text_history(model_descr, generation_config:
     {'role': 'user', 'content': 'What was my first question?'},
 ]
 @pytest.mark.precommit
+@pytest.mark.nightly
 @pytest.mark.parametrize('chat_config', get_chat_templates())
 def test_apply_chat_template(model_tmp_path, chat_config: Tuple[str, Dict]):
     tokenizer_config = chat_config[1]
diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py
index 40eba92277..e2395cf8d7 100644
--- a/tests/python_tests/test_generate_api.py
+++ b/tests/python_tests/test_generate_api.py
@@ -151,6 +151,7 @@ def hf_ov_genai_tensors_comparison(
 @pytest.mark.parametrize("generation_config,prompt", test_cases)
 @pytest.mark.parametrize("model_descr", get_models_list())
 @pytest.mark.precommit
+@pytest.mark.nightly
 def test_decoding(model_descr, generation_config, prompt):
     run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt)
 
@@ -168,6 +169,7 @@ def test_decoding(model_descr, generation_config, prompt):
     condition=sys.platform == "linux"
 )
 @pytest.mark.precommit
+@pytest.mark.nightly
 def test_ov_tensors(model_descr, inputs):
     hf_ov_genai_tensors_comparison(read_model(model_descr), dict(max_new_tokens=20), *inputs)
 
@@ -182,6 +184,7 @@ def test_ov_tensors(model_descr, inputs):
 @pytest.mark.parametrize("model_descr", get_models_list())
 @pytest.mark.parametrize("prompt", prompts)
 @pytest.mark.precommit
+@pytest.mark.nightly
 @pytest.mark.xfail(
     raises=TypeError,
     reason="pybind was unable to find ov::Tensor from openvino yet",
@@ -217,6 +220,7 @@ def test_genai_tokenizer_encode(model_descr, prompt):
 @pytest.mark.parametrize("model_descr", get_models_list())
 @pytest.mark.parametrize("encoded_prompt", encoded_prompts)
 @pytest.mark.precommit
+@pytest.mark.nightly
 @pytest.mark.xfail(
     raises=TypeError,
     reason="pybind was unable to find ov::Tensor from openvino yet",
@@ -252,6 +256,7 @@ def test_genai_tokenizer_decode(model_descr, encoded_prompt):
 @pytest.mark.parametrize("prompts", batched_prompts)
 @pytest.mark.parametrize("model_descr", get_models_list())
 @pytest.mark.precommit
+@pytest.mark.nightly
 def test_multibatch(model_descr, generation_config, prompts):
     run_hf_ov_genai_comparison_batched(read_model(model_descr), generation_config, prompts)
 
@@ -264,6 +269,7 @@ def test_multibatch(model_descr, generation_config, prompts):
 @pytest.mark.parametrize("prompt", prompts)
 @pytest.mark.parametrize("model_descr", get_models_list())
 @pytest.mark.precommit
+@pytest.mark.nightly
 def test_beam_search_decoding(model_descr, num_beam_groups, group_size,
                               max_new_tokens, diversity_penalty, prompt):
     generation_config = dict(
@@ -281,6 +287,7 @@ def test_beam_search_decoding(model_descr, num_beam_groups, group_size,
 @pytest.mark.parametrize("max_new_tokens", [10, 80])
 @pytest.mark.parametrize("model_descr", get_models_list())
 @pytest.mark.precommit
+@pytest.mark.nightly
 def test_stop_criteria(model_descr, stop_criteria, prompt, max_new_tokens):
     # todo: with EARLY stop_criteria looks like HF returns invalid output with sentence
     # while genai ends sentence with
@@ -323,6 +330,7 @@ def user_defined_callback(subword):
 
 @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
 @pytest.mark.precommit
+@pytest.mark.nightly
 def test_callback_one_string(callback):
     pipe = read_model(get_models_list()[0])[4]
     generation_config = pipe.get_generation_config()
@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) @pytest.mark.precommit +@pytest.mark.nightly def test_callback_batch_fail(callback): pipe = read_model(get_models_list()[0])[4] with pytest.raises(RuntimeError): @@ -340,12 +349,14 @@ def test_callback_batch_fail(callback): @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) @pytest.mark.precommit +@pytest.mark.nightly def test_callback_kwargs_one_string(callback): pipe = read_model(get_models_list()[0])[4] pipe.generate('table is made of', max_new_tokens=10, streamer=callback) @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) @pytest.mark.precommit +@pytest.mark.nightly @pytest.mark.parametrize("model_descr", get_models_list()) def test_callback_decoding_metallama(model_descr, callback): # On metallam this prompt generates output which can shorten after adding new tokens. @@ -359,6 +370,7 @@ def test_callback_decoding_metallama(model_descr, callback): @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) @pytest.mark.precommit +@pytest.mark.nightly def test_callback_kwargs_batch_fail(callback): pipe = read_model(get_models_list()[0])[4] with pytest.raises(RuntimeError): @@ -380,6 +392,7 @@ def end(self): @pytest.mark.precommit +@pytest.mark.nightly def test_streamer_one_string(): pipe = read_model(get_models_list()[0])[4] generation_config = pipe.get_generation_config() @@ -389,6 +402,7 @@ def test_streamer_one_string(): @pytest.mark.precommit +@pytest.mark.nightly def test_streamer_batch_fail(): pipe = read_model(get_models_list()[0])[4] printer = Printer(pipe.get_tokenizer()) @@ -397,6 +411,7 @@ def test_streamer_batch_fail(): @pytest.mark.precommit +@pytest.mark.nightly def test_streamer_kwargs_one_string(): pipe = read_model(get_models_list()[0])[4] printer = Printer(pipe.get_tokenizer()) @@ -404,6 +419,7 @@ def test_streamer_kwargs_one_string(): @pytest.mark.precommit +@pytest.mark.nightly def test_streamer_kwargs_batch_fail(): pipe = read_model(get_models_list()[0])[4] printer = Printer(pipe.get_tokenizer()) @@ -412,6 +428,7 @@ def test_streamer_kwargs_batch_fail(): @pytest.mark.precommit +@pytest.mark.nightly @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) def test_operator_with_callback_one_string(callback): pipe = read_model(get_models_list()[0])[4] @@ -421,6 +438,7 @@ def test_operator_with_callback_one_string(callback): @pytest.mark.precommit +@pytest.mark.nightly @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) def test_operator_with_callback_batch_fail(callback): pipe = read_model(get_models_list()[0])[4] @@ -429,6 +447,7 @@ def test_operator_with_callback_batch_fail(callback): @pytest.mark.precommit +@pytest.mark.nightly def test_operator_with_streamer_kwargs_one_string(): pipe = read_model(get_models_list()[0])[4] printer = Printer(pipe.get_tokenizer()) @@ -436,6 +455,7 @@ def test_operator_with_streamer_kwargs_one_string(): @pytest.mark.precommit +@pytest.mark.nightly def test_operator_with_streamer_kwargs_batch_fail(): pipe = read_model(get_models_list()[0])[4] printer = Printer(pipe.get_tokenizer()) @@ -444,6 +464,7 @@ def test_operator_with_streamer_kwargs_batch_fail(): @pytest.mark.precommit +@pytest.mark.nightly def test_load_special_tokens_ids_1(model_tmp_path): # test when there is an available config.json config_json = 
@@ -458,6 +479,7 @@ def test_load_special_tokens_ids_1(model_tmp_path):
 
 
 @pytest.mark.precommit
+@pytest.mark.nightly
 def test_load_special_tokens_str_2(model_tmp_path):
     # test with special_tokens_map
     special_tokens_map_json = {
@@ -472,6 +494,7 @@ def test_load_special_tokens_str_2(model_tmp_path):
 
 
 @pytest.mark.precommit
+@pytest.mark.nightly
 def test_load_special_tokens_3_(model_tmp_path):
     # special_tokens_map is not available
     # but tokenizer_config.json exists
@@ -498,6 +521,7 @@ def test_load_special_tokens_3_(model_tmp_path):
 
 
 @pytest.mark.precommit
+@pytest.mark.nightly
 def test_load_special_tokens_3(model_tmp_path):
     # both config.json and tokenizer_config.json are available
     # check that it does not read int values from tokenizer_config.json if they are in config.json
@@ -532,6 +556,7 @@
 
 
 @pytest.mark.precommit
+@pytest.mark.nightly
 @pytest.mark.xfail(
     raises=AssertionError,
     reason="CVS-143410 ov tokenizer should be aligned with hf",
@@ -575,6 +600,7 @@ def test_load_special_tokens_4(model_tmp_path):
 ]
 @pytest.mark.parametrize("generation_config", invalid_configs)
 @pytest.mark.precommit
+@pytest.mark.nightly
 def test_invalid_configs(model_tmp_path, generation_config):
     model_id, temp_path = model_tmp_path
     config_json = {}
@@ -584,6 +610,7 @@
 
 
 @pytest.mark.precommit
+@pytest.mark.nightly
 def test_valid_configs(model_tmp_path):
     model_id, temp_path = model_tmp_path
     pipe = load_pipe([({"eos_token_id": 37}, "config.json")], temp_path)
@@ -602,6 +629,7 @@ def test_valid_configs(model_tmp_path):
     dict(top_k=0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_k
 ]
 @pytest.mark.precommit
+@pytest.mark.nightly
 @pytest.mark.parametrize("generation_config", invalid_py_configs)
 def test_python_generation_config_validation(model_tmp_path, generation_config):
     model_id, temp_path = model_tmp_path
@@ -615,6 +643,7 @@ def test_python_generation_config_validation(model_tmp_path, generation_config):
 
 
 @pytest.mark.precommit
+@pytest.mark.nightly
 def test_unicode_pybind_decoding_1():
     # On this model this prompt generates unfinished utf string.
     # Test that pybind will not fail.
@@ -626,6 +655,7 @@
 
 
 @pytest.mark.precommit
+@pytest.mark.nightly
 def test_unicode_pybind_decoding_2():
     # On this model this prompt generates unfinished utf string.
     # Test that pybind will not fail.
@@ -636,6 +666,7 @@
 
 
 @pytest.mark.precommit
+@pytest.mark.nightly
 def test_unicode_pybind_decoding_3():
     # On this model this prompt generates unfinished utf-8 string
     # and streams it. Test that pybind will not fail while we pass string to python.
@@ -648,6 +679,7 @@
 
 @pytest.mark.skip(reason="probably both models ov + hf doesn't fit to memory")
 @pytest.mark.precommit
+@pytest.mark.nightly
 @pytest.mark.skipif(sys.platform.startswith("win"), reason="not enough space for this model on Win")
 def test_left_pad():
     # test left pad tokenizer post processing implementation