From 86fc0492f49a40b15198c82e47bba31f42d48f77 Mon Sep 17 00:00:00 2001
From: Guancheng Fu <110874468+gc-fu@users.noreply.github.com>
Date: Fri, 26 Jul 2024 09:38:39 +0800
Subject: [PATCH] Update oneccl used (#11647)

* Add internal oneccl

* fix

* fix

* add oneccl
---
 docker/llm/serving/xpu/docker/Dockerfile      |  33 ++-
 .../xpu/docker/gradio_web_server.patch        | 208 ++++++++++++++++++
 .../serving/xpu/docker/oneccl-binding.patch   |  14 ++
 .../start-lightweight_serving-service.sh      |   1 +
 .../xpu/docker/start-pp_serving-service.sh    |   3 +-
 .../serving/xpu/docker/start-vllm-service.sh  |   3 +-
 6 files changed, 259 insertions(+), 3 deletions(-)
 create mode 100644 docker/llm/serving/xpu/docker/gradio_web_server.patch
 create mode 100644 docker/llm/serving/xpu/docker/oneccl-binding.patch

diff --git a/docker/llm/serving/xpu/docker/Dockerfile b/docker/llm/serving/xpu/docker/Dockerfile
index 7a5d962e32c..2938ce4618b 100644
--- a/docker/llm/serving/xpu/docker/Dockerfile
+++ b/docker/llm/serving/xpu/docker/Dockerfile
@@ -1,3 +1,22 @@
+FROM intelanalytics/ipex-llm-serving-xpu:latest as build
+
+ARG http_proxy
+ARG https_proxy
+
+ADD ./oneccl-binding.patch /tmp/oneccl-binding.patch
+
+RUN cd /tmp/ && \
+    pip install --upgrade setuptools wheel twine && \
+    pip install "setuptools<70.0.0" && \
+    git clone https://github.com/intel/torch-ccl -b v2.1.100+xpu && \
+    cd torch-ccl && \
+    patch -p1 < /tmp/oneccl-binding.patch && \
+    git submodule sync && \
+    git submodule update --init --recursive && \
+    COMPUTE_BACKEND=dpcpp python setup.py sdist bdist_wheel && \
+    mv /tmp/torch-ccl/dist/oneccl_bind_pt-2.1.100+xpu-cp311-cp311-linux_x86_64.whl /tmp/
+
+
 FROM intelanalytics/ipex-llm-xpu:2.1.0-SNAPSHOT
 
 ARG http_proxy
@@ -5,12 +24,15 @@ ARG https_proxy
 # Disable pip's cache behavior
 ARG PIP_NO_CACHE_DIR=false
 
+COPY --from=build /tmp/oneccl_bind_pt-2.1.100+xpu-cp311-cp311-linux_x86_64.whl /tmp/
+ADD ./gradio_web_server.patch /tmp/gradio_web_server.patch
 # Install Serving Dependencies
 # Install ipex-llm[serving] only will update ipex_llm source code without updating
 # bigdl-core-xe, which will lead to problems
 RUN apt-get update && \
     apt-get install -y --no-install-recommends libfabric-dev wrk libaio-dev && \
+    apt-get install -y intel-opencl-icd intel-level-zero-gpu=1.3.26241.33-647~22.04 level-zero level-zero-dev --allow-downgrades && \
     pip install --pre --upgrade ipex-llm[xpu,serving] && \
     pip install transformers==4.37.0 gradio==4.19.2 && \
     # Install vLLM-v2 dependencies
@@ -24,7 +46,16 @@ RUN apt-get update && \
     pip install transformers_stream_generator einops tiktoken && \
     # For pipeline serving support
     pip install mpi4py fastapi uvicorn openai && \
-    pip install gradio # for gradio web UI
+    # for gradio web UI
+    pip install gradio && \
+    # Install internal oneccl && \
+    cd /tmp/ && \
+    wget https://sourceforge.net/projects/oneccl-wks/files/oneccl_wks_installer_2024.0.0.2.sh && \
+    bash oneccl_wks_installer_2024.0.0.2.sh && \
+    pip uninstall -y oneccl_bind_pt && \
+    pip install /tmp/oneccl_bind_pt-2.1.100+xpu-cp311-cp311-linux_x86_64.whl && \
+    rm /tmp/oneccl_bind_pt-2.1.100+xpu-cp311-cp311-linux_x86_64.whl && \
+    patch /usr/local/lib/python3.11/dist-packages/fastchat/serve/gradio_web_server.py < /tmp/gradio_web_server.patch
 
 COPY ./vllm_online_benchmark.py /llm/
 COPY ./vllm_offline_inference.py /llm/
diff --git a/docker/llm/serving/xpu/docker/gradio_web_server.patch b/docker/llm/serving/xpu/docker/gradio_web_server.patch
new file mode 100644
index 00000000000..807e0f22231
--- /dev/null
+++ b/docker/llm/serving/xpu/docker/gradio_web_server.patch
@@ -0,0 +1,208 @@
+--- gradio_web_server.py	2024-06-20 14:21:48.013518726 +0800
++++ gradio_web_server_new.py	2024-06-20 14:23:09.822830709 +0800
+@@ -9,8 +9,10 @@
+ import json
+ import os
+ import random
++import pandas as pd
+ import time
+ import uuid
++import numpy as np
+ 
+ import gradio as gr
+ import requests
+@@ -241,7 +243,7 @@
+     ip = get_ip(request)
+     logger.info(f"clear_history. ip: {ip}")
+     state = None
+-    return (state, [], "", None) + (disable_btn,) * 5
++    return (state, [], "", None, "", "", "", "") + (disable_btn,) * 5
+ 
+ 
+ def get_ip(request: gr.Request):
+@@ -354,6 +356,18 @@
+         return None
+ 
+ 
++def handle_latency_metrics(first_token_time, next_token_time):
++    # next token time is a numpy array...
++    # first token time might be None
++    first_token_latency = "None"
++    next_token_latency = "None"
++    if first_token_time is not None:
++        first_token_latency = str(first_token_time * 1000) + " ms"
++    if next_token_time.size > 0:
++        next_token_latency = str(np.mean(next_token_time) * 1000) + " ms"
++    return first_token_latency, next_token_latency
++
++
+ def bot_response(
+     state,
+     temperature,
+@@ -372,7 +386,7 @@
+     if state.skip_next:
+         # This generate call is skipped due to invalid inputs
+         state.skip_next = False
+-        yield (state, state.to_gradio_chatbot()) + (no_change_btn,) * 5
++        yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (no_change_btn,) * 5
+         return
+ 
+     if apply_rate_limit:
+@@ -381,7 +395,7 @@
+             error_msg = RATE_LIMIT_MSG + "\n\n" + ret["reason"]
+             logger.info(f"rate limit reached. ip: {ip}. error_msg: {ret['reason']}")
+             state.conv.update_last_message(error_msg)
+-            yield (state, state.to_gradio_chatbot()) + (no_change_btn,) * 5
++            yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (no_change_btn,) * 5
+             return
+ 
+     conv, model_name = state.conv, state.model_name
+@@ -404,6 +418,10 @@
+             yield (
+                 state,
+                 state.to_gradio_chatbot(),
++                "None",
++                "None",
++                "None",
++                "None",
+                 disable_btn,
+                 disable_btn,
+                 disable_btn,
+@@ -444,18 +462,32 @@
+         )
+ 
+     conv.update_last_message("▌")
+-    yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5
++    # We probably need to change this method
++    yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (disable_btn,) * 5
++    prompt_tokens = 0
++    generated_tokens = 0
++    first_token_latency = None
++    next_token_latencies = np.array([])
++    start_time = time.time()
+ 
+     try:
+         for i, data in enumerate(stream_iter):
+             if data["error_code"] == 0:
++                prompt_tokens = data["usage"]["prompt_tokens"]
++                generated_tokens = data["usage"]["completion_tokens"]
+                 output = data["text"].strip()
+                 conv.update_last_message(output + "▌")
+-                yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5
++                if first_token_latency is None:
++                    first_token_latency = time.time() - start_time
++                else:
++                    next_token_latencies = np.append(next_token_latencies, time.time() - start_time)
++                start_time = time.time()
++                first_latency, next_latency = handle_latency_metrics(first_token_latency, next_token_latencies)
++                yield (state, state.to_gradio_chatbot(), prompt_tokens, generated_tokens, first_latency, next_latency) + (disable_btn,) * 5
+             else:
+                 output = data["text"] + f"\n\n(error_code: {data['error_code']})"
+                 conv.update_last_message(output)
+-                yield (state, state.to_gradio_chatbot()) + (
++                yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (
+                     disable_btn,
+                     disable_btn,
+                     disable_btn,
+@@ -465,13 +497,14 @@
+             return
+         output = data["text"].strip()
+         conv.update_last_message(output)
+-        yield (state, state.to_gradio_chatbot()) + (enable_btn,) * 5
++        first_latency, next_latency = handle_latency_metrics(first_token_latency, next_token_latencies)
++        yield (state, state.to_gradio_chatbot(), prompt_tokens, generated_tokens, first_latency, next_latency) + (enable_btn,) * 5
+     except requests.exceptions.RequestException as e:
+         conv.update_last_message(
+             f"{SERVER_ERROR_MSG}\n\n"
+             f"(error_code: {ErrorCode.GRADIO_REQUEST_ERROR}, {e})"
+         )
+-        yield (state, state.to_gradio_chatbot()) + (
++        yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (
+             disable_btn,
+             disable_btn,
+             disable_btn,
+@@ -484,7 +517,7 @@
+             f"{SERVER_ERROR_MSG}\n\n"
+             f"(error_code: {ErrorCode.GRADIO_STREAM_UNKNOWN_ERROR}, {e})"
+         )
+-        yield (state, state.to_gradio_chatbot()) + (
++        yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (
+             disable_btn,
+             disable_btn,
+             disable_btn,
+@@ -646,7 +679,8 @@
+     )
+ 
+     notice_markdown = f"""
+-# 🏔️ Chat with Open Large Language Models
++# 🏔️ ChatBot based Xeon-W & Arc GPUs
++### Deployed with IPEX-LLM
+ {promotion}
+ """
+ 
+@@ -691,6 +725,26 @@
+         regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=False)
+         clear_btn = gr.Button(value="🗑️ Clear history", interactive=False)
+ 
++    with gr.Row():
++        with gr.Column():
++            gr.Markdown("### Performance Metrics")
++            prompt_token = gr.Textbox(
++                label="Prompt token length:",
++                interactive=False,
++            )
++            next_token = gr.Textbox(
++                label="Generated token length:",
++                interactive=False,
++            )
++            first_token_latency = gr.Textbox(
++                interactive=False,
++                label="First token Latency:",
++            )
++            next_token_latency = gr.Textbox(
++                interactive=False,
++                label="Next token Latency:",
++            )
++
+     with gr.Accordion("Parameters", open=False) as parameter_row:
+         temperature = gr.Slider(
+             minimum=0.0,
+@@ -743,9 +797,9 @@
+     ).then(
+         bot_response,
+         [state, temperature, top_p, max_output_tokens],
+-        [state, chatbot] + btn_list,
++        [state, chatbot, prompt_token, next_token, first_token_latency, next_token_latency] + btn_list,
+     )
+-    clear_btn.click(clear_history, None, [state, chatbot, textbox, imagebox] + btn_list)
++    clear_btn.click(clear_history, None, [state, chatbot, textbox, imagebox, prompt_token, next_token, first_token_latency, next_token_latency] + btn_list)
+ 
+     model_selector.change(
+         clear_history, None, [state, chatbot, textbox, imagebox] + btn_list
+@@ -758,7 +812,7 @@
+     ).then(
+         bot_response,
+         [state, temperature, top_p, max_output_tokens],
+-        [state, chatbot] + btn_list,
++        [state, chatbot, prompt_token, next_token, first_token_latency, next_token_latency] + btn_list,
+     )
+     send_btn.click(
+         add_text,
+@@ -767,7 +821,7 @@
+     ).then(
+         bot_response,
+         [state, temperature, top_p, max_output_tokens],
+-        [state, chatbot] + btn_list,
++        [state, chatbot, prompt_token, next_token, first_token_latency, next_token_latency] + btn_list,
+     )
+ 
+     return [state, model_selector]
+@@ -775,7 +829,7 @@
+ 
+ def build_demo(models):
+     with gr.Blocks(
+-        title="Chat with Open Large Language Models",
++        title="ChatBot based Xeon-W & Arc GPUs",
+         theme=gr.themes.Default(),
+         css=block_css,
+     ) as demo:
diff --git a/docker/llm/serving/xpu/docker/oneccl-binding.patch b/docker/llm/serving/xpu/docker/oneccl-binding.patch
new file mode 100644
index 00000000000..4b8410dce9d
--- /dev/null
+++ b/docker/llm/serving/xpu/docker/oneccl-binding.patch
@@ -0,0 +1,14 @@
+diff --git a/src/gpu/dpcpp_ccl.cpp b/src/gpu/dpcpp_ccl.cpp
+index 3bd8087..c5b5ce3 100644
+--- a/src/gpu/dpcpp_ccl.cpp
++++ b/src/gpu/dpcpp_ccl.cpp
+@@ -689,7 +689,8 @@ c10::intrusive_ptr XPUCCLStubs::allreduce_(std::v
+                        stream,
+                        attr), stream.get_native());
+       });
+-      // printf("Use One CCL allreduce.\n");
++      stream.get_native().wait();
++      // printf("Use One CCL allreduce.\n");
+       return ret_evt;
+   },
+   c10d::OpType::ALLREDUCE);
diff --git a/docker/llm/serving/xpu/docker/start-lightweight_serving-service.sh b/docker/llm/serving/xpu/docker/start-lightweight_serving-service.sh
index b51e4fc3e13..86e9d56f943 100644
--- a/docker/llm/serving/xpu/docker/start-lightweight_serving-service.sh
+++ b/docker/llm/serving/xpu/docker/start-lightweight_serving-service.sh
@@ -1,4 +1,5 @@
 cd /llm/lightweight_serving
 model_path="/llm/models/Llama-2-7b-chat-hf"
 low_bit="sym_int4"
+source /opt/intel/1ccl-wks/setvars.sh
 python lightweight_serving.py --repo-id-or-model-path $model_path --low-bit $low_bit
\ No newline at end of file
diff --git a/docker/llm/serving/xpu/docker/start-pp_serving-service.sh b/docker/llm/serving/xpu/docker/start-pp_serving-service.sh
index b6be00cf4ad..588f2922285 100644
--- a/docker/llm/serving/xpu/docker/start-pp_serving-service.sh
+++ b/docker/llm/serving/xpu/docker/start-pp_serving-service.sh
@@ -6,7 +6,8 @@ export OMP_NUM_THREADS=32
 export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so
 basekit_root=/opt/intel/oneapi
 source $basekit_root/setvars.sh --force
-source $basekit_root/ccl/latest/env/vars.sh --force
+# source $basekit_root/ccl/latest/env/vars.sh --force
+source /opt/intel/1ccl-wks/setvars.sh
 export USE_XETLA=OFF
 
 if [[ $KERNEL_VERSION != *"6.5"* ]]; then
diff --git a/docker/llm/serving/xpu/docker/start-vllm-service.sh b/docker/llm/serving/xpu/docker/start-vllm-service.sh
index 7cc409efb5f..c0d0f112c41 100644
--- a/docker/llm/serving/xpu/docker/start-vllm-service.sh
+++ b/docker/llm/serving/xpu/docker/start-vllm-service.sh
@@ -2,7 +2,8 @@
 model="YOUR_MODEL_PATH"
 served_model_name="YOUR_MODEL_NAME"
-
+source /opt/intel/1ccl-wks/setvars.sh
+
 python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \
   --served-model-name $served_model_name \
   --port 8000 \