diff --git a/docker/llm/serving/xpu/docker/README.md b/docker/llm/serving/xpu/docker/README.md
index cab1942a554..4a4b07ab953 100644
--- a/docker/llm/serving/xpu/docker/README.md
+++ b/docker/llm/serving/xpu/docker/README.md
@@ -23,7 +23,6 @@ export DOCKER_IMAGE=intelanalytics/ipex-llm-serving-xpu:2.1.0-SNAPSHOT
 sudo docker run -itd \
         --net=host \
         --device=/dev/dri \
-        --memory="32G" \
         --name=CONTAINER_NAME \
         --shm-size="16g" \
         $DOCKER_IMAGE
diff --git a/docker/llm/serving/xpu/docker/start-pp_serving-service.sh b/docker/llm/serving/xpu/docker/start-pp_serving-service.sh
index 9e3333aa766..b6be00cf4ad 100644
--- a/docker/llm/serving/xpu/docker/start-pp_serving-service.sh
+++ b/docker/llm/serving/xpu/docker/start-pp_serving-service.sh
@@ -14,11 +14,15 @@ if [[ $KERNEL_VERSION != *"6.5"* ]]; then
 fi
 
 export TORCH_LLM_ALLREDUCE=0
+export IPEX_LLM_LAST_LM_HEAD=1
 export IPEX_LLM_QUANTIZE_KV_CACHE=1
+export IPEX_LLM_LOW_MEM=1
 export num_gpus=2
 export model_path="/llm/models/Llama-2-7b-chat-hf"
 export low_bit="fp8"
 # max requests = max_num_reqs * rank_num
 export max_num_seqs="4"
+export max_prefilled_seqs="0"
+
 cd /llm/pp_serving
-CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $num_gpus pipeline_serving.py --repo-id-or-model-path $model_path --low-bit $low_bit --max-num-seqs $max_num_seqs
+CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $num_gpus pipeline_serving.py --repo-id-or-model-path $model_path --low-bit $low_bit --max-num-seqs $max_num_seqs --max-prefilled-seqs $max_prefilled_seqs
diff --git a/python/llm/example/GPU/Pipeline-Parallel-FastAPI/run.sh b/python/llm/example/GPU/Pipeline-Parallel-FastAPI/run.sh
index 91bf6161bff..14d5a3e6e33 100644
--- a/python/llm/example/GPU/Pipeline-Parallel-FastAPI/run.sh
+++ b/python/llm/example/GPU/Pipeline-Parallel-FastAPI/run.sh
@@ -28,9 +28,16 @@ if [[ $KERNEL_VERSION != *"6.5"* ]]; then
   export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
 fi
 export TORCH_LLM_ALLREDUCE=0
+export IPEX_LLM_QUANTIZE_KV_CACHE=1
+export IPEX_LLM_LAST_LM_HEAD=1
+export IPEX_LLM_LOW_MEM=1
 
 export MODEL_PATH=YOUR_MODEL_PATH
 export NUM_GPUS=2
-export IPEX_LLM_QUANTIZE_KV_CACHE=1
+export ZE_AFFINITY_MASK=0,1
+export LOW_BIT="fp8"
+# max requests = max_num_reqs * rank_num
+export MAX_NUM_SEQS="4"
+export MAX_PREFILLED_SEQS=0
 
-CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS pipeline_serving.py --repo-id-or-model-path $MODEL_PATH --low-bit fp8 --max-num-seqs 4 --max-prefilled-seqs 0
+CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS pipeline_serving.py --repo-id-or-model-path $MODEL_PATH --low-bit $LOW_BIT --max-num-seqs $MAX_NUM_SEQS --max-prefilled-seqs $MAX_PREFILLED_SEQS
diff --git a/python/llm/src/ipex_llm/transformers/pipeline_parallel.py b/python/llm/src/ipex_llm/transformers/pipeline_parallel.py
index de5e5f56df8..8ff87af28e4 100644
--- a/python/llm/src/ipex_llm/transformers/pipeline_parallel.py
+++ b/python/llm/src/ipex_llm/transformers/pipeline_parallel.py
@@ -447,6 +447,7 @@ def load_model(self, model_path, world_size, low_bit='sym_int4'):
         model = AutoModelForCausalLM.from_pretrained(model_path,
                                                      load_in_low_bit=low_bit,
                                                      torch_dtype=self.dtype,
+                                                     cpu_embedding=True,
                                                      optimize_model=True,
                                                      trust_remote_code=True,
                                                      use_cache=True,
@@ -499,7 +500,8 @@ def cat_kv_cache(self, model_type, kv_cache_1, kv_cache_2):
             return tuple(result)
         else:
             # num_layers = self.model.layer_end - self.model.layer_start
-            for layer_idx in range(self.model.num_layers):
+            num_cache = min(len(kv_cache_1.key_cache), self.model.num_layers)
+            for layer_idx in range(num_cache):
                 kv_cache_1.key_cache[layer_idx] = \
                     torch.cat([kv_cache_1.key_cache[layer_idx],
                                kv_cache_2.key_cache[layer_idx]], dim=0)