diff --git a/python/llm/example/GPU/Pipeline-Parallel-Inference/README.md b/python/llm/example/GPU/Pipeline-Parallel-Inference/README.md
index 70394b6a0ee..c25a75c52cf 100644
--- a/python/llm/example/GPU/Pipeline-Parallel-Inference/README.md
+++ b/python/llm/example/GPU/Pipeline-Parallel-Inference/README.md
@@ -52,6 +52,8 @@ pip install oneccl_bind_pt==2.1.100 --extra-index-url https://pytorch-extension.
 
 For optimal performance, it is recommended to set several environment variables. We provide example usages as following:
 
+> Note: INT4 optimization is applied to the model by default. You could specify other low bit optimizations (such as 'fp8' and 'fp6') through `--low-bit`.
+
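For reference, the new `--low-bit` flag drops straight into the torchrun invocations that the run scripts below already use. A minimal usage sketch of a non-default run (the model path and GPU count are illustrative, borrowed from the Llama-2 script; 'fp8' is one of the low-bit types the README note mentions):

```bash
# Illustrative only: same launch pattern as the run_*_arc_2_card.sh scripts,
# but requesting FP8 weights instead of the default sym_int4.
NUM_GPUS=2 # number of used GPU
CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
  generate.py --repo-id-or-model-path 'meta-llama/Llama-2-7b-chat-hf' --gpu-num $NUM_GPUS --low-bit 'fp8'
```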
diff --git a/python/llm/example/GPU/Pipeline-Parallel-Inference/generate.py b/python/llm/example/GPU/Pipeline-Parallel-Inference/generate.py
index 90d662ac029..8d3ad9a1773 100644
--- a/python/llm/example/GPU/Pipeline-Parallel-Inference/generate.py
+++ b/python/llm/example/GPU/Pipeline-Parallel-Inference/generate.py
@@ -34,16 +34,18 @@
                         help='Prompt to infer')
     parser.add_argument('--n-predict', type=int, default=32,
                         help='Max tokens to predict')
+    parser.add_argument('--low-bit', type=str, default='sym_int4', help='The quantization type the model will convert to.')
     parser.add_argument('--gpu-num', type=int, default=2, help='GPU number to use')
 
     args = parser.parse_args()
     model_path = args.repo_id_or_model_path
+    low_bit = args.low_bit
 
     # Load model in 4 bit,
     # which convert the relevant layers in the model into INT4 format
     try:
         model = AutoModelForCausalLM.from_pretrained(model_path,
-                                                     load_in_4bit=True,
+                                                     load_in_low_bit=low_bit,
                                                      optimize_model=True,
                                                      trust_remote_code=True,
                                                      use_cache=True,
@@ -51,7 +53,7 @@
                                                      pipeline_parallel_stages=args.gpu_num)
     except:
         model = AutoModel.from_pretrained(model_path,
-                                          load_in_4bit=True,
+                                          load_in_low_bit=low_bit,
                                           optimize_model=True,
                                           trust_remote_code=True,
                                           use_cache=True,
diff --git a/python/llm/example/GPU/Pipeline-Parallel-Inference/run_baichuan2_arc_2_card.sh b/python/llm/example/GPU/Pipeline-Parallel-Inference/run_baichuan2_arc_2_card.sh
index 10eb12eab2a..a7a737f4b83 100644
--- a/python/llm/example/GPU/Pipeline-Parallel-Inference/run_baichuan2_arc_2_card.sh
+++ b/python/llm/example/GPU/Pipeline-Parallel-Inference/run_baichuan2_arc_2_card.sh
@@ -29,8 +29,8 @@ NUM_GPUS=2 # number of used GPU
 
 # To run Baichuan2-7B-Chat
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'baichuan-inc/Baichuan2-7B-Chat' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'baichuan-inc/Baichuan2-7B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run Baichuan2-13B-Chat
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'baichuan-inc/Baichuan2-13B-Chat' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'baichuan-inc/Baichuan2-13B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
diff --git a/python/llm/example/GPU/Pipeline-Parallel-Inference/run_chatglm_arc_2_card.sh b/python/llm/example/GPU/Pipeline-Parallel-Inference/run_chatglm_arc_2_card.sh
index ab275117364..3fec566feb1 100644
--- a/python/llm/example/GPU/Pipeline-Parallel-Inference/run_chatglm_arc_2_card.sh
+++ b/python/llm/example/GPU/Pipeline-Parallel-Inference/run_chatglm_arc_2_card.sh
@@ -28,4 +28,4 @@ export TORCH_LLM_ALLREDUCE=0
 NUM_GPUS=2 # number of used GPU
 # To run chatglm3-6b
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'THUDM/chatglm3-6b' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'THUDM/chatglm3-6b' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
diff --git a/python/llm/example/GPU/Pipeline-Parallel-Inference/run_codellama_arc_2_card.sh b/python/llm/example/GPU/Pipeline-Parallel-Inference/run_codellama_arc_2_card.sh
index 3de4c1ca197..8c4b77aa852 100644
--- a/python/llm/example/GPU/Pipeline-Parallel-Inference/run_codellama_arc_2_card.sh
+++ b/python/llm/example/GPU/Pipeline-Parallel-Inference/run_codellama_arc_2_card.sh
@@ -30,12 +30,12 @@ NUM_GPUS=2 # number of used GPU
 
 # To run CodeLlama-7b-Instruct-hf
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'codellama/CodeLlama-7b-Instruct-hf' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'codellama/CodeLlama-7b-Instruct-hf' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # To run CodeLlama-13b-Instruct-hf
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'codellama/CodeLlama-13b-Instruct-hf' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'codellama/CodeLlama-13b-Instruct-hf' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # To run CodeLlama-34b-Instruct-hf
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'codellama/CodeLlama-34b-Instruct-hf' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'codellama/CodeLlama-34b-Instruct-hf' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
diff --git a/python/llm/example/GPU/Pipeline-Parallel-Inference/run_llama_arc_2_card.sh b/python/llm/example/GPU/Pipeline-Parallel-Inference/run_llama_arc_2_card.sh
index c6002e6796a..ceedc3db4f1 100644
--- a/python/llm/example/GPU/Pipeline-Parallel-Inference/run_llama_arc_2_card.sh
+++ b/python/llm/example/GPU/Pipeline-Parallel-Inference/run_llama_arc_2_card.sh
@@ -30,12 +30,12 @@ NUM_GPUS=2 # number of used GPU
 
 # To run Llama-2-7b-chat-hf
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'meta-llama/Llama-2-7b-chat-hf' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'meta-llama/Llama-2-7b-chat-hf' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run Llama-2-13b-chat-hf
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'meta-llama/Llama-2-13b-chat-hf' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'meta-llama/Llama-2-13b-chat-hf' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run Meta-Llama-3-8B-Instruct
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'meta-llama/Meta-Llama-3-8B-Instruct' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'meta-llama/Meta-Llama-3-8B-Instruct' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
diff --git a/python/llm/example/GPU/Pipeline-Parallel-Inference/run_mistral_arc_2_card.sh b/python/llm/example/GPU/Pipeline-Parallel-Inference/run_mistral_arc_2_card.sh
index 0eb36a42785..da4ebbbe369 100644
--- a/python/llm/example/GPU/Pipeline-Parallel-Inference/run_mistral_arc_2_card.sh
+++ b/python/llm/example/GPU/Pipeline-Parallel-Inference/run_mistral_arc_2_card.sh
@@ -30,8 +30,8 @@ NUM_GPUS=2 # number of used GPU
 
 # To run Mistral-7B-v0.1
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'mistralai/Mistral-7B-v0.1' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'mistralai/Mistral-7B-v0.1' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # To run Mixtral-8x7B-Instruct-v0.1
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'mistralai/Mixtral-8x7B-Instruct-v0.1' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'mistralai/Mixtral-8x7B-Instruct-v0.1' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
diff --git a/python/llm/example/GPU/Pipeline-Parallel-Inference/run_phi3_arc_2_card.sh b/python/llm/example/GPU/Pipeline-Parallel-Inference/run_phi3_arc_2_card.sh
index 83f59885981..113c143369b 100644
--- a/python/llm/example/GPU/Pipeline-Parallel-Inference/run_phi3_arc_2_card.sh
+++ b/python/llm/example/GPU/Pipeline-Parallel-Inference/run_phi3_arc_2_card.sh
@@ -30,8 +30,8 @@ NUM_GPUS=2 # number of used GPU
 
 # To run Phi-3-medium-4k-instruct
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'microsoft/Phi-3-medium-4k-instruct' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'microsoft/Phi-3-medium-4k-instruct' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run Phi-3-mini-4k-instruct
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'microsoft/Phi-3-mini-4k-instruct' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'microsoft/Phi-3-mini-4k-instruct' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
diff --git a/python/llm/example/GPU/Pipeline-Parallel-Inference/run_qwen1.5_arc_2_card.sh b/python/llm/example/GPU/Pipeline-Parallel-Inference/run_qwen1.5_arc_2_card.sh
index 783ee436e5b..33550e6b7ee 100644
--- a/python/llm/example/GPU/Pipeline-Parallel-Inference/run_qwen1.5_arc_2_card.sh
+++ b/python/llm/example/GPU/Pipeline-Parallel-Inference/run_qwen1.5_arc_2_card.sh
@@ -29,20 +29,20 @@ NUM_GPUS=2 # number of used GPU
 
 # To run Qwen1.5-7B-Chat
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-7B-Chat' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-7B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run Qwen1.5-14B-Chat
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-14B-Chat' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-14B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run Qwen1.5-32B-Chat
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-32B-Chat' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-32B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run Qwen1.5-MoE-A2.7B-Chat
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-MoE-A2.7B-Chat' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'Qwen/Qwen1.5-MoE-A2.7B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # # To run CodeQwen1.5-7B-Chat
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'Qwen/CodeQwen1.5-7B-Chat' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'Qwen/CodeQwen1.5-7B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
diff --git a/python/llm/example/GPU/Pipeline-Parallel-Inference/run_qwen2_arc_2_card.sh b/python/llm/example/GPU/Pipeline-Parallel-Inference/run_qwen2_arc_2_card.sh
index 2c3e1e1c3b3..9203f664a51 100644
--- a/python/llm/example/GPU/Pipeline-Parallel-Inference/run_qwen2_arc_2_card.sh
+++ b/python/llm/example/GPU/Pipeline-Parallel-Inference/run_qwen2_arc_2_card.sh
@@ -29,4 +29,4 @@ NUM_GPUS=2 # number of used GPU
 
 # To run Qwen2-7B-Instruct
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'Qwen/Qwen2-7B-Instruct' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'Qwen/Qwen2-7B-Instruct' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
diff --git a/python/llm/example/GPU/Pipeline-Parallel-Inference/run_solar_arc_2_card.sh b/python/llm/example/GPU/Pipeline-Parallel-Inference/run_solar_arc_2_card.sh
index 76781745f14..8750c6082d1 100644
--- a/python/llm/example/GPU/Pipeline-Parallel-Inference/run_solar_arc_2_card.sh
+++ b/python/llm/example/GPU/Pipeline-Parallel-Inference/run_solar_arc_2_card.sh
@@ -30,4 +30,4 @@ NUM_GPUS=2 # number of used GPU
 
 # To run SOLAR-10.7B-Instruct-v1.0
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'upstage/SOLAR-10.7B-Instruct-v1.0' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'upstage/SOLAR-10.7B-Instruct-v1.0' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
diff --git a/python/llm/example/GPU/Pipeline-Parallel-Inference/run_vicuna_arc_2_card.sh b/python/llm/example/GPU/Pipeline-Parallel-Inference/run_vicuna_arc_2_card.sh
index 2f50893ed61..667eb6b6f71 100644
--- a/python/llm/example/GPU/Pipeline-Parallel-Inference/run_vicuna_arc_2_card.sh
+++ b/python/llm/example/GPU/Pipeline-Parallel-Inference/run_vicuna_arc_2_card.sh
@@ -30,12 +30,12 @@ NUM_GPUS=2 # number of used GPU
 
 # To run vicuna-7b-v1.3
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path 'lmsys/vicuna-7b-v1.3' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path 'lmsys/vicuna-7b-v1.3' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # To run vicuna-13b-v1.3
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'lmsys/vicuna-13b-v1.3' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'lmsys/vicuna-13b-v1.3' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # To run vicuna-33b-v1.3
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path 'lmsys/vicuna-33b-v1.3' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path 'lmsys/vicuna-33b-v1.3' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
diff --git a/python/llm/example/GPU/Pipeline-Parallel-Inference/run_yi_arc_2_card.sh b/python/llm/example/GPU/Pipeline-Parallel-Inference/run_yi_arc_2_card.sh
index ff0a83eb84a..9df3593b219 100644
--- a/python/llm/example/GPU/Pipeline-Parallel-Inference/run_yi_arc_2_card.sh
+++ b/python/llm/example/GPU/Pipeline-Parallel-Inference/run_yi_arc_2_card.sh
@@ -30,8 +30,8 @@ NUM_GPUS=2 # number of used GPU
 
 # To run Yi-6B-Chat
 CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-  generate.py --repo-id-or-model-path '01-ai/Yi-6B-Chat' --gpu-num $NUM_GPUS
+  generate.py --repo-id-or-model-path '01-ai/Yi-6B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
 
 # To run Yi-34B-Chat
 # CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
-#   generate.py --repo-id-or-model-path '01-ai/Yi-34B-Chat' --gpu-num $NUM_GPUS
+#   generate.py --repo-id-or-model-path '01-ai/Yi-34B-Chat' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
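Because the new argparse option in generate.py defaults to `'sym_int4'`, invocations that omit `--low-bit` keep the previous INT4 behavior, so the explicit `--low-bit 'sym_int4'` added to the scripts above is equivalent to the old command lines. A minimal sketch (model path taken from the chatglm script above):

```bash
# Both commands load the model with sym_int4 weights: the first relies on the
# new default in generate.py, the second passes the value explicitly.
NUM_GPUS=2 # number of used GPU
CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
  generate.py --repo-id-or-model-path 'THUDM/chatglm3-6b' --gpu-num $NUM_GPUS
CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS \
  generate.py --repo-id-or-model-path 'THUDM/chatglm3-6b' --gpu-num $NUM_GPUS --low-bit 'sym_int4'
```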