forked from intel-analytics/ipex-llm

Commit
Update oneccl used (intel-analytics#11647)
* Add internal oneccl
* fix
* fix
* add oneccl
Showing 6 changed files with 259 additions and 3 deletions.
@@ -0,0 +1,208 @@
--- gradio_web_server.py 2024-06-20 14:21:48.013518726 +0800
+++ gradio_web_server_new.py 2024-06-20 14:23:09.822830709 +0800
@@ -9,8 +9,10 @@
 import json
 import os
 import random
+import pandas as pd
 import time
 import uuid
+import numpy as np
 
 import gradio as gr
 import requests
@@ -241,7 +243,7 @@
     ip = get_ip(request)
     logger.info(f"clear_history. ip: {ip}")
     state = None
-    return (state, [], "", None) + (disable_btn,) * 5
+    return (state, [], "", None, "", "", "", "") + (disable_btn,) * 5
 
 
 def get_ip(request: gr.Request):
@@ -354,6 +356,18 @@
     return None
 
 
+def handle_latency_metrics(first_token_time, next_token_time):
+    # next token time is a numpy array...
+    # first token time might be None
+    first_token_latency = "None"
+    next_token_latency = "None"
+    if first_token_time is not None:
+        first_token_latency = str(first_token_time * 1000) + " ms"
+    if next_token_time.size > 0:
+        next_token_latency = str(np.mean(next_token_time) * 1000) + " ms"
+    return first_token_latency, next_token_latency
+
+
 def bot_response(
     state,
     temperature,
@@ -372,7 +386,7 @@
     if state.skip_next:
         # This generate call is skipped due to invalid inputs
         state.skip_next = False
-        yield (state, state.to_gradio_chatbot()) + (no_change_btn,) * 5
+        yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (no_change_btn,) * 5
         return
 
     if apply_rate_limit:
@@ -381,7 +395,7 @@
             error_msg = RATE_LIMIT_MSG + "\n\n" + ret["reason"]
             logger.info(f"rate limit reached. ip: {ip}. error_msg: {ret['reason']}")
             state.conv.update_last_message(error_msg)
-            yield (state, state.to_gradio_chatbot()) + (no_change_btn,) * 5
+            yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (no_change_btn,) * 5
             return
 
     conv, model_name = state.conv, state.model_name
@@ -404,6 +418,10 @@
         yield (
             state,
             state.to_gradio_chatbot(),
+            "None",
+            "None",
+            "None",
+            "None",
             disable_btn,
             disable_btn,
             disable_btn,
@@ -444,18 +462,32 @@
     )
 
     conv.update_last_message("▌")
-    yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5
+    # We probably need to change this method
+    yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (disable_btn,) * 5
+    prompt_tokens = 0
+    generated_tokens = 0
+    first_token_latency = None
+    next_token_latencies = np.array([])
+    start_time = time.time()
 
     try:
         for i, data in enumerate(stream_iter):
             if data["error_code"] == 0:
+                prompt_tokens = data["usage"]["prompt_tokens"]
+                generated_tokens = data["usage"]["completion_tokens"]
                 output = data["text"].strip()
                 conv.update_last_message(output + "▌")
-                yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5
+                if first_token_latency is None:
+                    first_token_latency = time.time() - start_time
+                else:
+                    next_token_latencies = np.append(next_token_latencies, time.time() - start_time)
+                start_time = time.time()
+                first_latency, next_latency = handle_latency_metrics(first_token_latency, next_token_latencies)
+                yield (state, state.to_gradio_chatbot(), prompt_tokens, generated_tokens, first_latency, next_latency) + (disable_btn,) * 5
             else:
                 output = data["text"] + f"\n\n(error_code: {data['error_code']})"
                 conv.update_last_message(output)
-                yield (state, state.to_gradio_chatbot()) + (
+                yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (
                     disable_btn,
                     disable_btn,
                     disable_btn,
@@ -465,13 +497,14 @@
                 return
         output = data["text"].strip()
         conv.update_last_message(output)
-        yield (state, state.to_gradio_chatbot()) + (enable_btn,) * 5
+        first_latency, next_latency = handle_latency_metrics(first_token_latency, next_token_latencies)
+        yield (state, state.to_gradio_chatbot(), prompt_tokens, generated_tokens, first_latency, next_latency) + (enable_btn,) * 5
     except requests.exceptions.RequestException as e:
         conv.update_last_message(
             f"{SERVER_ERROR_MSG}\n\n"
             f"(error_code: {ErrorCode.GRADIO_REQUEST_ERROR}, {e})"
         )
-        yield (state, state.to_gradio_chatbot()) + (
+        yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (
             disable_btn,
             disable_btn,
             disable_btn,
@@ -484,7 +517,7 @@
             f"{SERVER_ERROR_MSG}\n\n"
             f"(error_code: {ErrorCode.GRADIO_STREAM_UNKNOWN_ERROR}, {e})"
         )
-        yield (state, state.to_gradio_chatbot()) + (
+        yield (state, state.to_gradio_chatbot(), "None", "None", "None", "None") + (
             disable_btn,
             disable_btn,
             disable_btn,
@@ -646,7 +679,8 @@
     )
 
     notice_markdown = f"""
-# 🏔️ Chat with Open Large Language Models
+# 🏔️ ChatBot based Xeon-W & Arc GPUs
+### Deployed with IPEX-LLM
 {promotion}
 """
 
@@ -691,6 +725,26 @@
         regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=False)
         clear_btn = gr.Button(value="🗑️ Clear history", interactive=False)
 
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown("### Performance Metrics")
+            prompt_token = gr.Textbox(
+                label="Prompt token length:",
+                interactive=False,
+            )
+            next_token = gr.Textbox(
+                label="Generated token length:",
+                interactive=False,
+            )
+            first_token_latency = gr.Textbox(
+                interactive=False,
+                label="First token Latency:",
+            )
+            next_token_latency = gr.Textbox(
+                interactive=False,
+                label="Next token Latency:",
+            )
+
     with gr.Accordion("Parameters", open=False) as parameter_row:
         temperature = gr.Slider(
             minimum=0.0,
@@ -743,9 +797,9 @@
     ).then(
         bot_response,
         [state, temperature, top_p, max_output_tokens],
-        [state, chatbot] + btn_list,
+        [state, chatbot, prompt_token, next_token, first_token_latency, next_token_latency] + btn_list,
     )
-    clear_btn.click(clear_history, None, [state, chatbot, textbox, imagebox] + btn_list)
+    clear_btn.click(clear_history, None, [state, chatbot, textbox, imagebox, prompt_token, next_token, first_token_latency, next_token_latency] + btn_list)
 
     model_selector.change(
         clear_history, None, [state, chatbot, textbox, imagebox] + btn_list
@@ -758,7 +812,7 @@
     ).then(
         bot_response,
         [state, temperature, top_p, max_output_tokens],
-        [state, chatbot] + btn_list,
+        [state, chatbot, prompt_token, next_token, first_token_latency, next_token_latency] + btn_list,
     )
     send_btn.click(
         add_text,
@@ -767,7 +821,7 @@
     ).then(
         bot_response,
         [state, temperature, top_p, max_output_tokens],
-        [state, chatbot] + btn_list,
+        [state, chatbot, prompt_token, next_token, first_token_latency, next_token_latency] + btn_list,
     )
 
     return [state, model_selector]
@@ -775,7 +829,7 @@
 
 def build_demo(models):
     with gr.Blocks(
-        title="Chat with Open Large Language Models",
+        title="ChatBot based Xeon-W & Arc GPUs",
         theme=gr.themes.Default(),
         css=block_css,
     ) as demo:
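For reference, below is a minimal standalone sketch of the per-chunk timing scheme this patch wires into bot_response. The stream_chunks generator is a hypothetical stand-in for the real model-worker stream iterator; the helper mirrors the patched handle_latency_metrics.

import time
import numpy as np

def handle_latency_metrics(first_token_time, next_token_time):
    # first_token_time may be None (no chunk arrived yet);
    # next_token_time is a numpy array of chunk-to-chunk latencies.
    first_token_latency = "None"
    next_token_latency = "None"
    if first_token_time is not None:
        first_token_latency = str(first_token_time * 1000) + " ms"
    if next_token_time.size > 0:
        next_token_latency = str(np.mean(next_token_time) * 1000) + " ms"
    return first_token_latency, next_token_latency

def stream_chunks():
    # Hypothetical stream: each chunk arrives after a short delay.
    for _ in range(5):
        time.sleep(0.05)
        yield {}

first_token_latency = None
next_token_latencies = np.array([])
start_time = time.time()
for chunk in stream_chunks():
    now = time.time()
    if first_token_latency is None:
        first_token_latency = now - start_time  # latency to the first chunk
    else:
        next_token_latencies = np.append(next_token_latencies, now - start_time)
    start_time = now  # each subsequent latency is measured chunk-to-chunk

print(handle_latency_metrics(first_token_latency, next_token_latencies))

Two design notes: the measurement is per streamed chunk, and a chunk may carry more than one token, so "Next token Latency" is really the mean inter-chunk latency; and every yield in the patched bot_response must emit four extra values (prompt tokens, generated tokens, first/next token latency) because the four new Textbox components were appended to the handlers' Gradio output lists.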
@@ -0,0 +1,14 @@
diff --git a/src/gpu/dpcpp_ccl.cpp b/src/gpu/dpcpp_ccl.cpp
index 3bd8087..c5b5ce3 100644
--- a/src/gpu/dpcpp_ccl.cpp
+++ b/src/gpu/dpcpp_ccl.cpp
@@ -689,7 +689,8 @@ c10::intrusive_ptr<ProcessGroupCCL::AsyncWorkCCL> XPUCCLStubs::allreduce_(std::v
           stream,
           attr), stream.get_native());
     });
-    // printf("Use One CCL allreduce.\n");
+    stream.get_native().wait();
+    // printf("Use One CCL allreduce.\n");
     return ret_evt;
   },
   c10d::OpType::ALLREDUCE);
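The added stream.get_native().wait() blocks on the underlying SYCL queue before allreduce_ returns, so the collective is in effect synchronous rather than only returning an event. A hedged Python-level illustration of the resulting behavior (not part of the commit; assumes an initialized oneCCL process group and an IPEX XPU build that exposes torch.xpu):

import torch
import torch.distributed as dist

def allreduce_sync(t: torch.Tensor) -> torch.Tensor:
    # With the patched binding, dist.all_reduce on XPU already waits for
    # the device queue, roughly equivalent to this explicit pattern:
    dist.all_reduce(t)       # issue the oneCCL collective
    torch.xpu.synchronize()  # wait for device completion (assumed IPEX API)
    return t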
docker/llm/serving/xpu/docker/start-lightweight_serving-service.sh (1 addition, 0 deletions)
@@ -1,4 +1,5 @@
 cd /llm/lightweight_serving
 model_path="/llm/models/Llama-2-7b-chat-hf"
 low_bit="sym_int4"
+source /opt/intel/1ccl-wks/setvars.sh
 python lightweight_serving.py --repo-id-or-model-path $model_path --low-bit $low_bit