From 86fc0492f49a40b15198c82e47bba31f42d48f77 Mon Sep 17 00:00:00 2001
From: Guancheng Fu <110874468+gc-fu@users.noreply.github.com>
Date: Fri, 26 Jul 2024 09:38:39 +0800
Subject: [PATCH] Update oneccl used (#11647)

* Add internal oneccl

* fix

* fix

* add oneccl
---
 docker/llm/serving/xpu/docker/Dockerfile      |  33 ++-
 .../xpu/docker/gradio_web_server.patch        | 208 ++++++++++++++++++
 .../serving/xpu/docker/oneccl-binding.patch   |  14 ++
 .../start-lightweight_serving-service.sh      |   1 +
 .../xpu/docker/start-pp_serving-service.sh    |   3 +-
 .../serving/xpu/docker/start-vllm-service.sh  |   3 +-
 6 files changed, 259 insertions(+), 3 deletions(-)
 create mode 100644 docker/llm/serving/xpu/docker/gradio_web_server.patch
 create mode 100644 docker/llm/serving/xpu/docker/oneccl-binding.patch

diff --git a/docker/llm/serving/xpu/docker/Dockerfile b/docker/llm/serving/xpu/docker/Dockerfile
index 7a5d962e32c..2938ce4618b 100644
--- a/docker/llm/serving/xpu/docker/Dockerfile
+++ b/docker/llm/serving/xpu/docker/Dockerfile
@@ -1,3 +1,22 @@
+FROM intelanalytics/ipex-llm-serving-xpu:latest as build
+
+ARG http_proxy
+ARG https_proxy
+
+ADD ./oneccl-binding.patch /tmp/oneccl-binding.patch
+
+RUN cd /tmp/ && \
+    pip install --upgrade setuptools wheel twine && \
+    pip install "setuptools<70.0.0" && \
+    git clone https://github.com/intel/torch-ccl -b v2.1.100+xpu && \
+    cd torch-ccl && \
+    patch -p1 < /tmp/oneccl-binding.patch && \
+    git submodule sync && \
+    git submodule update --init --recursive && \
+    COMPUTE_BACKEND=dpcpp python setup.py sdist bdist_wheel && \
+    mv /tmp/torch-ccl/dist/oneccl_bind_pt-2.1.100+xpu-cp311-cp311-linux_x86_64.whl /tmp/
+
+
 FROM intelanalytics/ipex-llm-xpu:2.1.0-SNAPSHOT
 
 ARG http_proxy
@@ -5,12 +24,15 @@ ARG https_proxy
 # Disable pip's cache behavior
 ARG PIP_NO_CACHE_DIR=false
 
+COPY --from=build /tmp/oneccl_bind_pt-2.1.100+xpu-cp311-cp311-linux_x86_64.whl /tmp/
+ADD ./gradio_web_server.patch /tmp/gradio_web_server.patch
 # Install Serving Dependencies
 # Install ipex-llm[serving] only will update ipex_llm source code without updating
 # bigdl-core-xe, which will lead to problems
 RUN apt-get update && \
     apt-get install -y --no-install-recommends libfabric-dev wrk libaio-dev && \
+    apt-get install -y intel-opencl-icd intel-level-zero-gpu=1.3.26241.33-647~22.04 level-zero level-zero-dev --allow-downgrades && \
     pip install --pre --upgrade ipex-llm[xpu,serving] && \
     pip install transformers==4.37.0 gradio==4.19.2 && \
     # Install vLLM-v2 dependencies
@@ -24,7 +46,16 @@ RUN apt-get update && \
     pip install transformers_stream_generator einops tiktoken && \
     # For pipeline serving support
     pip install mpi4py fastapi uvicorn openai && \
-    pip install gradio # for gradio web UI
+    # for gradio web UI
+    pip install gradio && \
+    # Install internal oneccl && \
+    cd /tmp/ && \
+    wget https://sourceforge.net/projects/oneccl-wks/files/oneccl_wks_installer_2024.0.0.2.sh && \
+    bash oneccl_wks_installer_2024.0.0.2.sh && \
+    pip uninstall -y oneccl_bind_pt && \
+    pip install /tmp/oneccl_bind_pt-2.1.100+xpu-cp311-cp311-linux_x86_64.whl && \
+    rm /tmp/oneccl_bind_pt-2.1.100+xpu-cp311-cp311-linux_x86_64.whl && \
+    patch /usr/local/lib/python3.11/dist-packages/fastchat/serve/gradio_web_server.py < /tmp/gradio_web_server.patch
 
 COPY ./vllm_online_benchmark.py /llm/
 COPY ./vllm_offline_inference.py /llm/
diff --git a/docker/llm/serving/xpu/docker/gradio_web_server.patch b/docker/llm/serving/xpu/docker/gradio_web_server.patch
new file mode 100644
index 00000000000..807e0f22231
--- /dev/null
+++ b/docker/llm/serving/xpu/docker/gradio_web_server.patch
@@ -0,0 +1,208 @@
+--- gradio_web_server.py	2024-06-20 14:21:48.013518726 +0800
++++ gradio_web_server_new.py	2024-06-20 14:23:09.822830709 +0800
+@@ -9,8 +9,10 @@
+ import json
+ import os
+ import random
++import pandas as pd
+ import time
+ import uuid
++import numpy as np
+ 
+ import gradio as gr
+ import requests
+@@ -241,7 +243,7 @@
+     ip = get_ip(request)
+     logger.info(f"clear_history. ip: {ip}")
+     state = None
+-    return (state, [], "", None) + (disable_btn,) * 5
++    return (state, [], "", None, "", "", "", "") + (disable_btn,) * 5
+ 
+ 
+ def get_ip(request: gr.Request):
+@@ -354,6 +356,18 @@
+         return None
+ 
+ 
++def handle_latency_metrics(first_token_time, next_token_time):
++    # next token time is a numpy array...
++    # first token time might be None
++    first_token_latency = "None"
++    next_token_latency = "None"
++    if first_token_time is not None:
++        first_token_latency = str(first_token_time * 1000) + " ms"
++    if next_token_time.size > 0:
++        next_token_latency = str(np.mean(next_token_time) * 1000) + " ms"
++    return first_token_latency, next_token_latency
++
++
+ def bot_response(
+     state,
+     temperature,
+@@ -372,7 +386,7 @@
+     if state.skip_next:
+         # This generate call is skipped due to invalid inputs
+         state.skip_next = False
+-        yield (state, state.to_gradio_chatbot()) + (no_change_btn,) * 5
++        yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (no_change_btn,) * 5
+         return
+ 
+     if apply_rate_limit:
+@@ -381,7 +395,7 @@
+             error_msg = RATE_LIMIT_MSG + "\n\n" + ret["reason"]
+             logger.info(f"rate limit reached. ip: {ip}. error_msg: {ret['reason']}")
+             state.conv.update_last_message(error_msg)
+-            yield (state, state.to_gradio_chatbot()) + (no_change_btn,) * 5
++            yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (no_change_btn,) * 5
+             return
+ 
+     conv, model_name = state.conv, state.model_name
+@@ -404,6 +418,10 @@
+             yield (
+                 state,
+                 state.to_gradio_chatbot(),
++                "None",
++                "None",
++                "None",
++                "None",
+                 disable_btn,
+                 disable_btn,
+                 disable_btn,
+@@ -444,18 +462,32 @@
+         )
+ 
+     conv.update_last_message("▌")
+-    yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5
++    # We probably need to change this method
++    yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (disable_btn,) * 5
++    prompt_tokens = 0
++    generated_tokens = 0
++    first_token_latency = None
++    next_token_latencies = np.array([])
++    start_time = time.time()
+ 
+     try:
+         for i, data in enumerate(stream_iter):
+             if data["error_code"] == 0:
++                prompt_tokens = data["usage"]["prompt_tokens"]
++                generated_tokens = data["usage"]["completion_tokens"]
+                 output = data["text"].strip()
+                 conv.update_last_message(output + "▌")
+-                yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5
++                if first_token_latency is None:
++                    first_token_latency = time.time() - start_time
++                else:
++                    next_token_latencies = np.append(next_token_latencies, time.time() - start_time)
++                start_time = time.time()
++                first_latency, next_latency = handle_latency_metrics(first_token_latency, next_token_latencies)
++                yield (state, state.to_gradio_chatbot(), prompt_tokens, generated_tokens, first_latency, next_latency) + (disable_btn,) * 5
+             else:
+                 output = data["text"] + f"\n\n(error_code: {data['error_code']})"
+                 conv.update_last_message(output)
+-                yield (state, state.to_gradio_chatbot()) + (
++                yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (
+                     disable_btn,
+                     disable_btn,
+                     disable_btn,
+@@ -465,13 +497,14 @@
+             return
+         output = data["text"].strip()
+         conv.update_last_message(output)
+-        yield (state, state.to_gradio_chatbot()) + (enable_btn,) * 5
++        first_latency, next_latency = handle_latency_metrics(first_token_latency, next_token_latencies)
++        yield (state, state.to_gradio_chatbot(), prompt_tokens, generated_tokens, first_latency, next_latency) + (enable_btn,) * 5
+     except requests.exceptions.RequestException as e:
+         conv.update_last_message(
+             f"{SERVER_ERROR_MSG}\n\n"
+             f"(error_code: {ErrorCode.GRADIO_REQUEST_ERROR}, {e})"
+         )
+-        yield (state, state.to_gradio_chatbot()) + (
++        yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (
+             disable_btn,
+             disable_btn,
+             disable_btn,
+@@ -484,7 +517,7 @@
+             f"{SERVER_ERROR_MSG}\n\n"
+             f"(error_code: {ErrorCode.GRADIO_STREAM_UNKNOWN_ERROR}, {e})"
+         )
+-        yield (state, state.to_gradio_chatbot()) + (
++        yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (
+             disable_btn,
+             disable_btn,
+             disable_btn,
+@@ -646,7 +679,8 @@
+     )
+ 
+     notice_markdown = f"""
+-# 🏔️ Chat with Open Large Language Models
++# 🏔️ ChatBot based Xeon-W & Arc GPUs
++### Deployed with IPEX-LLM
+ {promotion}
+ """
+ 
+@@ -691,6 +725,26 @@
+         regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=False)
+         clear_btn = gr.Button(value="🗑️ Clear history", interactive=False)
+ 
++    with gr.Row():
++        with gr.Column():
++            gr.Markdown("### Performance Metrics")
++            prompt_token = gr.Textbox(
++                label="Prompt token length:",
++                interactive=False,
++            )
++            next_token = gr.Textbox(
++                label="Generated token length:",
++                interactive=False,
++            )
++            first_token_latency = gr.Textbox(
++                interactive=False,
++                label="First token Latency:",
++            )
++            next_token_latency = gr.Textbox(
++                interactive=False,
++                label="Next token Latency:",
++            )
++
+     with gr.Accordion("Parameters", open=False) as parameter_row:
+         temperature = gr.Slider(
+             minimum=0.0,
+@@ -743,9 +797,9 @@
+     ).then(
+         bot_response,
+         [state, temperature, top_p, max_output_tokens],
+-        [state, chatbot] + btn_list,
++        [state, chatbot, prompt_token, next_token, first_token_latency, next_token_latency] + btn_list,
+     )
+-    clear_btn.click(clear_history, None, [state, chatbot, textbox, imagebox] + btn_list)
++    clear_btn.click(clear_history, None, [state, chatbot, textbox, imagebox, prompt_token, next_token, first_token_latency, next_token_latency] + btn_list)
+ 
+     model_selector.change(
+         clear_history, None, [state, chatbot, textbox, imagebox] + btn_list
+@@ -758,7 +812,7 @@
+     ).then(
+         bot_response,
+         [state, temperature, top_p, max_output_tokens],
+-        [state, chatbot] + btn_list,
++        [state, chatbot, prompt_token, next_token, first_token_latency, next_token_latency] + btn_list,
+     )
+     send_btn.click(
+         add_text,
+@@ -767,7 +821,7 @@
+     ).then(
+         bot_response,
+         [state, temperature, top_p, max_output_tokens],
+-        [state, chatbot] + btn_list,
++        [state, chatbot, prompt_token, next_token, first_token_latency, next_token_latency] + btn_list,
+     )
+ 
+     return [state, model_selector]
+@@ -775,7 +829,7 @@
+ 
+ def build_demo(models):
+     with gr.Blocks(
+-        title="Chat with Open Large Language Models",
++        title="ChatBot based Xeon-W & Arc GPUs",
+         theme=gr.themes.Default(),
+         css=block_css,
+     ) as demo:
diff --git a/docker/llm/serving/xpu/docker/oneccl-binding.patch b/docker/llm/serving/xpu/docker/oneccl-binding.patch
new file mode 100644
index 00000000000..4b8410dce9d
--- /dev/null
+++ b/docker/llm/serving/xpu/docker/oneccl-binding.patch
@@ -0,0 +1,14 @@
+diff --git a/src/gpu/dpcpp_ccl.cpp b/src/gpu/dpcpp_ccl.cpp
+index 3bd8087..c5b5ce3 100644
+--- a/src/gpu/dpcpp_ccl.cpp
++++ b/src/gpu/dpcpp_ccl.cpp
+@@ -689,7 +689,8 @@ c10::intrusive_ptr XPUCCLStubs::allreduce_(std::v
+                        stream,
+                        attr), stream.get_native());
+       });
+-      // printf("Use One CCL allreduce.\n");
++      stream.get_native().wait();
++      // printf("Use One CCL allreduce.\n");
+       return ret_evt;
+   },
+   c10d::OpType::ALLREDUCE);
diff --git a/docker/llm/serving/xpu/docker/start-lightweight_serving-service.sh b/docker/llm/serving/xpu/docker/start-lightweight_serving-service.sh
index b51e4fc3e13..86e9d56f943 100644
--- a/docker/llm/serving/xpu/docker/start-lightweight_serving-service.sh
+++ b/docker/llm/serving/xpu/docker/start-lightweight_serving-service.sh
@@ -1,4 +1,5 @@
 cd /llm/lightweight_serving
 model_path="/llm/models/Llama-2-7b-chat-hf"
 low_bit="sym_int4"
+source /opt/intel/1ccl-wks/setvars.sh
 python lightweight_serving.py --repo-id-or-model-path $model_path --low-bit $low_bit
\ No newline at end of file
diff --git a/docker/llm/serving/xpu/docker/start-pp_serving-service.sh b/docker/llm/serving/xpu/docker/start-pp_serving-service.sh
index b6be00cf4ad..588f2922285 100644
--- a/docker/llm/serving/xpu/docker/start-pp_serving-service.sh
+++ b/docker/llm/serving/xpu/docker/start-pp_serving-service.sh
@@ -6,7 +6,8 @@ export OMP_NUM_THREADS=32
 export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so
 basekit_root=/opt/intel/oneapi
 source $basekit_root/setvars.sh --force
-source $basekit_root/ccl/latest/env/vars.sh --force
+# source $basekit_root/ccl/latest/env/vars.sh --force
+source /opt/intel/1ccl-wks/setvars.sh
 export USE_XETLA=OFF
 
 if [[ $KERNEL_VERSION != *"6.5"* ]]; then
diff --git a/docker/llm/serving/xpu/docker/start-vllm-service.sh b/docker/llm/serving/xpu/docker/start-vllm-service.sh
index 7cc409efb5f..c0d0f112c41 100644
--- a/docker/llm/serving/xpu/docker/start-vllm-service.sh
+++ b/docker/llm/serving/xpu/docker/start-vllm-service.sh
@@ -2,7 +2,8 @@
 model="YOUR_MODEL_PATH"
 served_model_name="YOUR_MODEL_NAME"
-
+source /opt/intel/1ccl-wks/setvars.sh
+
 python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \
   --served-model-name $served_model_name \
   --port 8000 \