
Commit 995e4eb

Merge remote-tracking branch 'upstream/master' into prefix_caching

popovaan committed Jul 17, 2024
2 parents a154dba + 7f5e8d2
Showing 35 changed files with 2,085 additions and 736 deletions.
99 changes: 8 additions & 91 deletions .github/workflows/genai_python_lib.yml
@@ -11,7 +11,7 @@ env:
jobs:
ubuntu_genai_python_lib:
# A tokenizers' dependency fails to compile on ubuntu-20 and in the CentOS7 env.
runs-on: ubuntu-22.04
runs-on: ubuntu-22.04-16-cores
env:
# A tokenizers' dependency fails to compile with Ninja in the CentOS7 env.
CMAKE_GENERATOR: Unix Makefiles
@@ -30,9 +30,9 @@ jobs:
- run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
- run: source ./ov/setupvars.sh && cmake --build ./build/ --config Release -j
- run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --upgrade-strategy eager
- run: source ./ov/setupvars.sh && PYTHONPATH=./build/:$PYTHONPATH python -m pytest ./tests/python_tests/test_generate_api.py -m precommit
- run: source ./ov/setupvars.sh && PYTHONPATH=./build/:$PYTHONPATH python -m pytest ./tests/python_tests/
- run: source ./ov/setupvars.sh && python -m pip install . --verbose
- run: python -m pytest ./tests/python_tests/test_generate_api.py -m precommit
- run: python -m pytest ./tests/python_tests/

macos_genai_python_lib:
runs-on: macos-12
@@ -53,13 +53,12 @@ jobs:
- run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
- run: source ./ov/setupvars.sh && cmake --build ./build/ --config Release -j
- run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --upgrade-strategy eager
- run: source ./ov/setupvars.sh && PYTHONPATH=./build/:$PYTHONPATH python -m pytest ./tests/python_tests/test_generate_api.py -m precommit
- run: source ./ov/setupvars.sh && PYTHONPATH=./build/:$PYTHONPATH python -m pytest ./tests/python_tests/
- run: source ./ov/setupvars.sh && python -m pip install . --verbose
- run: python -c "from openvino_genai import LLMPipeline"
- run: python -m pytest ./tests/python_tests/test_generate_api.py -m precommit
- run: python -m pytest ./tests/python_tests/

windows_genai_python_lib:
if: false
runs-on: windows-latest
env:
CMAKE_BUILD_PARALLEL_LEVEL: null
@@ -73,62 +72,6 @@ jobs:
- uses: actions/setup-python@v4
with:
python-version: 3.8
- run: curl --output ov.zip ${{ env.l_ov_link }}
- run: unzip -d ov ov.zip
- run: dirs=(ov/*) && mv ov/*/* ov && rmdir "${dirs[@]}"
shell: bash
- run: call ./ov/setupvars.bat && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
- run: call ./ov/setupvars.bat && cmake --build ./build/ --config Release -j
- run: call ./ov/setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --upgrade-strategy eager
# cmd evaluates variables differently; setting PYTHONPATH before calling setupvars.bat (rather than after) works around that.
- run: set "PYTHONPATH=./build/" && call ./ov/setupvars.bat && python -m pytest ./tests/python_tests/test_generate_api.py -m precommit
- run: call ./ov/setupvars.bat && python -m pip install . --verbose
- run: python -m pytest ./tests/python_tests/test_generate_api.py -m precommit

continuous_batching_python_lib_ubuntu:
# A tokenizers' dependency fails to compile on ubuntu-20 and in the CentOS7 env.
runs-on: ubuntu-22.04
env:
# A tokenizers' dependency fails to compile with Ninja in the CentOS7 env.
CMAKE_GENERATOR: Unix Makefiles
CMAKE_BUILD_PARALLEL_LEVEL: null
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: actions/setup-python@v4
with:
python-version: 3.8
# Install the CentOS7 OpenVINO build instead of the Ubuntu one to match the PyPI distribution ABI.
- name: Install OpenVINO
run: |
mkdir ./ov/
curl ${{ env.l_ov_centos_link }} | tar --directory ./ov/ --strip-components 1 -xz
sudo ./ov/install_dependencies/install_openvino_dependencies.sh
- name: Install dependencies and build
run: |
source ./ov/setupvars.sh
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --upgrade-strategy eager
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
cmake --build ./build/ --config Release -j
- run: source ./ov/setupvars.sh && PYTHONPATH=./build/:$PYTHONPATH python -m pytest ./tests/python_tests/test_sampling.py -m precommit
- run: source ./ov/setupvars.sh && PYTHONPATH=./build/:$PYTHONPATH python -m pytest ./tests/python_tests/test_preemption.py -m precommit
- run: source ./ov/setupvars.sh && python -m pip install .
- run: python -m pytest ./tests/python_tests/test_preemption.py -m precommit

continuous_batching_python_lib_windows:
runs-on: windows-latest
defaults:
run:
shell: cmd
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: actions/setup-python@v4
with:
python-version: 3.8

- name: Install OpenVINO
run: |
curl --output ov.zip ${{ env.w_ov_link }}
@@ -141,33 +84,7 @@ jobs:
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --upgrade-strategy eager
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
cmake --build ./build/ --config Release -j
- run: set "PYTHONPATH=./build/" && call ./ov/setupvars.bat && python -m pytest ./tests/python_tests/test_sampling.py -m precommit
- run: set "PYTHONPATH=./build/" && call ./ov/setupvars.bat && python -m pytest ./tests/python_tests/test_preemption.py -m precommit
# cmd evaluates variables differently; setting PYTHONPATH before calling setupvars.bat (rather than after) works around that.
- run: set "PYTHONPATH=./build/" && call ./ov/setupvars.bat && python -m pytest ./tests/python_tests/
- run: call ./ov/setupvars.bat && python -m pip install . --verbose
- run: python -m pytest ./tests/python_tests/test_preemption.py -m precommit


continuous_batching_python_lib_macos:
runs-on: macos-12
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: actions/setup-python@v4
with:
python-version: 3.8
- name: Install OpenVINO
run: |
mkdir ./ov/
curl ${{ env.m_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
brew install coreutils scons
- name: Download, convert and build
run: |
source ./ov/setupvars.sh
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --upgrade-strategy eager
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
cmake --build ./build/ --config Release -j
- run: source ./ov/setupvars.sh && PYTHONPATH=./build/:$PYTHONPATH python -m pytest ./tests/python_tests/test_sampling.py -m precommit
- run: source ./ov/setupvars.sh && PYTHONPATH=./build/:$PYTHONPATH python -m pytest ./tests/python_tests/test_preemption.py -m precommit
- run: source ./ov/setupvars.sh && python -m pip install .
- run: python -m pytest ./tests/python_tests/test_preemption.py -m precommit
- run: python -m pytest ./tests/python_tests/
@@ -161,6 +161,10 @@ std::vector<int64_t> LMSDiscreteScheduler::get_timesteps() const {
}

std::map<std::string, ov::Tensor> LMSDiscreteScheduler::step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step) {
if (inference_step == 0) {
m_derivative_list.clear();
}

// LMS step function:
std::vector<float> derivative;
derivative.reserve(latents.get_size());
2 changes: 1 addition & 1 deletion image_generation/requirements.txt
@@ -1,2 +1,2 @@
-r ../samples/requirements.txt
diffusers==0.27.2
diffusers==0.29.2
8 changes: 4 additions & 4 deletions image_generation/stable_diffusion_1_5/cpp/src/main.cpp
@@ -365,11 +365,11 @@ int32_t main(int32_t argc, char* argv[]) try {

ov::Tensor text_embeddings = text_encoder(models, positive_prompt, negative_prompt, do_classifier_free_guidance);

for (uint32_t n = 0; n < num_images; n++) {
std::shared_ptr<Scheduler> scheduler = std::make_shared<LMSDiscreteScheduler>();
scheduler->set_timesteps(num_inference_steps);
std::vector<std::int64_t> timesteps = scheduler->get_timesteps();
std::shared_ptr<Scheduler> scheduler = std::make_shared<LMSDiscreteScheduler>();
scheduler->set_timesteps(num_inference_steps);
std::vector<std::int64_t> timesteps = scheduler->get_timesteps();

for (uint32_t n = 0; n < num_images; n++) {
std::uint32_t seed = num_images == 1 ? user_seed : user_seed + n;

const size_t unet_in_channels = static_cast<size_t>(sample_shape[1].get_length());
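Taken together with the derivative-list reset added to LMSDiscreteScheduler::step above, this hunk hoists scheduler creation out of the per-image loop, so a single scheduler instance now serves every generated image; the step-zero reset keeps one image's LMS history from leaking into the next. Below is a minimal, self-contained C++ sketch of that reset-on-first-step pattern. The Stepper class and its float stand-ins are illustrative only, not code from this repository.

#include <cstddef>
#include <iostream>
#include <vector>

// Illustrative stand-in for a stateful scheduler: history accumulated for
// one generation must not leak into the next, so it is cleared at step 0
// (the same idea as m_derivative_list.clear() in the hunk above).
class Stepper {
    std::vector<float> m_history;
public:
    float step(float value, std::size_t inference_step) {
        if (inference_step == 0)
            m_history.clear();
        m_history.push_back(value);
        return static_cast<float>(m_history.size());
    }
};

int main() {
    Stepper stepper;  // created once, like the hoisted scheduler
    for (int image = 0; image < 2; ++image) {
        for (std::size_t i = 0; i < 3; ++i)
            std::cout << stepper.step(0.5f, i) << ' ';
        std::cout << '\n';  // prints "1 2 3" for each image, not "1 2 3 4 5 6"
    }
}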
7 changes: 6 additions & 1 deletion llm_bench/python/benchmark.py
@@ -101,7 +101,7 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list,
mem_consumption.start_collect_memory_consumption()
max_gen_tokens = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count']
start = time.perf_counter()
if args['infer_count'] is not None:
if args['infer_count'] is not None and args['end_token_stopping'] is False:
model.generation_config.eos_token_id = None
model.config.eos_token_id = None
result = model.generate(
@@ -693,6 +693,11 @@ def get_argprser():
parser.add_argument('-od', '--output_dir', help='Save the input text and generated text, images to files')
utils.model_utils.add_stateful_model_arguments(parser)
parser.add_argument("--genai", action="store_true")
parser.add_argument(
'--end_token_stopping',
action='store_true',
help=f'Stop generation at the end token even if the output token count has not reached infer_count or the max token size ({DEFAULT_OUTPUT_TOKEN_SIZE}).'
)

return parser.parse_args()

1 change: 1 addition & 0 deletions llm_bench/python/utils/model_utils.py
@@ -139,6 +139,7 @@ def analyze_args(args):
if model_args['prompt_index'] is not None:
# Deduplication
[model_args['prompt_index'].append(i) for i in args.prompt_index if i not in model_args['prompt_index']]
model_args['end_token_stopping'] = args.end_token_stopping

model_framework = args.framework
model_path = Path(args.model)
13 changes: 9 additions & 4 deletions llm_bench/python/utils/nncf_utils.py
@@ -38,10 +38,9 @@ def get_compressed_path(output_dir: str, base_precision, option: str):


INT4_MODEL_CONFIGURATION = {
"dolly-v2-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8},
"dolly-v2-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 1.0, "scale": True},
"gpt-j-6b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64},
"opt-6.7b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64, "ratio": 0.8},
"bloomz-7b1": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 32, "ratio": 0.6},
"red-pajama-incite-7b-instruct": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128},
"zephyr-7b-beta": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.8,
"dataset": {"name": "wikitext,wikitext-2-v1,train[:1000],text", "awq": True}},
@@ -58,7 +57,7 @@ def get_compressed_path(output_dir: str, base_precision, option: str):
"rocket-3b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.8},
"chatglm2-6b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.72},
"qwen-7b-chat": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.6},
"open-llama-3b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "all_layers": True},
"open-llama-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64, "ratio": 1.0, "all_layers": True},
"falcon-7b-instruct": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "all_layers": True},
"orca-mini-3b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "all_layers": True,
"dataset": {"name": "wikitext,wikitext-2-v1,train[:1000],text", "awq": False}},
@@ -70,7 +69,13 @@
"mistral-7b-v0.1": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.9},
"llama-7b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.7},
"opt-2.7b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.7},
"red-pajama-incite-chat-3b-v1": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8},
"red-pajama-incite-chat-3b-v1": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 1.0, "scale": True},
"vicuna-7b-v1.5": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 1.0},
"stablelm-tuned-alpha-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8},
"gpt-2": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.5, "scale": True},
"longchat-b7": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.9},
"starcoder2-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.9},
"tiny-llama-1.1b-chat": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8},
"stablelm-7b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.6, "scale": True},
"phi-2": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.9},
}
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -16,7 +16,7 @@ classifiers = [
"Programming Language :: Python :: 3.12",
]
dependencies = [
"openvino_tokenizers~=2024.3.0.0"
"openvino_tokenizers~=2024.3.0.0.dev"
]

[tool.py-build-cmake.module]
2 changes: 1 addition & 1 deletion samples/cpp/chat_sample/chat_sample.cpp
@@ -10,7 +10,7 @@ int main(int argc, char* argv[]) try {
std::string prompt;
std::string model_path = argv[1];

std::string device = "CPU"; // GPU can be used as well
std::string device = "CPU"; // GPU, NPU can be used as well
ov::genai::LLMPipeline pipe(model_path, "CPU");

ov::genai::GenerationConfig config;
@@ -11,8 +11,6 @@
#include <mutex>
#include <atomic>


#include <openvino/openvino.hpp>
#include <nlohmann/json.hpp>
#include <cxxopts.hpp>

@@ -123,6 +121,11 @@ Dataset filtered_dataset(const std::string& models_path, const std::string& data

ov::genai::GenerationConfig greedy_search = ov::genai::greedy();
greedy_search.max_new_tokens = std::min(max_output_len, output_len);
greedy_search.repetition_penalty = 1.0;
greedy_search.frequency_penalty = 0.0;
greedy_search.presence_penalty = 0.0;
greedy_search.diversity_penalty = 0.0;
greedy_search.length_penalty = 0.0;

dataset.push_data(human_question, greedy_search);
dataset.push_lens(input_len, output_len);
15 changes: 14 additions & 1 deletion src/cpp/include/openvino/genai/llm_pipeline.hpp
@@ -215,7 +215,20 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
GenerationConfig get_generation_config() const;
void set_generation_config(const GenerationConfig& config);

void start_chat();

/**
* @brief Start chat, keeping the history in the KV cache.
* Turns on keeping the KV cache between generate calls and automatic application of chat templates.
* If beam search is used, the KV cache is kept for the generated sequence with the maximal score.
*
* @param system_message optional system message.
*/
void start_chat(const std::string& system_message = "");

/**
* @brief Finish chat and clear the KV cache.
* Turns off keeping the KV cache between generate calls.
*/
void finish_chat();
private:
std::unique_ptr<LLMPipelineImplBase> m_pimpl;
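The new start_chat overload pairs with finish_chat in a simple usage pattern. The following is a hedged sketch, not an official sample; it reuses the generate() call style from this repository's samples, and "model_dir" is a placeholder for a real exported model directory.

#include "openvino/genai/llm_pipeline.hpp"
#include <iostream>
#include <string>

int main() {
    // "model_dir" is a placeholder for a real exported model directory.
    ov::genai::LLMPipeline pipe("model_dir", "CPU");

    // Seed the chat with an optional system message; the history is then
    // kept in the KV cache across generate() calls.
    pipe.start_chat("You are a helpful assistant.");
    std::string first = pipe.generate("What is OpenVINO?");
    std::cout << first << '\n';
    std::string second = pipe.generate("Summarize that in one sentence.");
    std::cout << second << '\n';
    pipe.finish_chat();  // stop keeping history and clear the KV cache
}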
2 changes: 1 addition & 1 deletion src/cpp/include/openvino/genai/tokenizer.hpp
@@ -79,7 +79,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
* @return A string with the transformed and concatenated prompts from the chat history.
* @throws Exception if the chat template was unable to parse the input history.
*/
std::string apply_chat_template(const ChatHistory& history,
std::string apply_chat_template(ChatHistory history,
bool add_generation_prompt,
const std::string& chat_template="") const;

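apply_chat_template now takes ChatHistory by value rather than by const reference, presumably so the implementation can adjust its own copy before rendering. Below is a usage sketch under that assumption; it treats ChatHistory as a sequence of {"role", "content"} entries, as used elsewhere in openvino_genai, and the tokenizer path is a placeholder.

#include "openvino/genai/tokenizer.hpp"
#include <iostream>
#include <string>

int main() {
    ov::genai::Tokenizer tokenizer("model_dir");  // placeholder path
    ov::genai::ChatHistory history{
        {{"role", "system"}, {"content", "You are a helpful assistant."}},
        {{"role", "user"}, {"content", "What is OpenVINO?"}}
    };
    // Pass-by-value hands the method its own mutable copy of the history.
    std::string prompt = tokenizer.apply_chat_template(history, /*add_generation_prompt=*/true);
    std::cout << prompt << '\n';
}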
1 change: 1 addition & 0 deletions src/cpp/src/continuous_batching_pipeline.cpp
@@ -61,6 +61,7 @@ class ContinuousBatchingPipeline::Impl {
for (const auto& sequence: request->get_sequences()) {
m_scheduler->free_sequence(sequence->get_id());
}
m_sampler->clear_beam_search_info(request->get_request_id());
requests_iterator = m_requests.erase(requests_iterator);
} else {
requests_iterator++;