diff --git a/.github/workflows/manually_build.yml b/.github/workflows/manually_build.yml
index 0d3a01b3938..e0b8db8a4bd 100644
--- a/.github/workflows/manually_build.yml
+++ b/.github/workflows/manually_build.yml
@@ -10,13 +10,14 @@ on:
         type: choice
         options:
         - all
-        - ipex-llm-finetune-lora-cpu
-        - ipex-llm-finetune-qlora-cpu
-        - ipex-llm-finetune-qlora-xpu
-        - ipex-llm-xpu
         - ipex-llm-cpu
-        - ipex-llm-serving-xpu
+        - ipex-llm-xpu
         - ipex-llm-serving-cpu
+        - ipex-llm-serving-xpu
+        - ipex-llm-finetune-lora-cpu
+        - ipex-llm-finetune-qlora-cpu-standalone
+        - ipex-llm-finetune-qlora-cpu-k8s
+        - ipex-llm-finetune-qlora-xpu
       tag:
         description: 'docker image tag (e.g. 2.1.0-SNAPSHOT)'
         required: true
@@ -72,8 +73,8 @@ jobs:
         sudo docker push ${image}:latest
         sudo docker rmi -f ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG} ${image}:latest
 
-  ipex-llm-finetune-qlora-cpu:
-    if: ${{ inputs.artifact == 'ipex-llm-finetune-qlora-cpu' || inputs.artifact == 'all' }}
+  ipex-llm-finetune-qlora-cpu-standalone:
+    if: ${{ inputs.artifact == 'ipex-llm-finetune-qlora-cpu-standalone' || inputs.artifact == 'all' }}
     runs-on: [self-hosted, Shire]
 
     steps:
@@ -81,12 +82,12 @@ jobs:
     - name: docker login
       run: |
         docker login -u ${DOCKERHUB_USERNAME} -p ${DOCKERHUB_PASSWORD}
-    - name: ipex-llm-finetune-qlora-cpu
+    - name: ipex-llm-finetune-qlora-cpu-standalone
      run: |
        echo "##############################################################"
-        echo "####### ipex-llm-finetune-qlora-cpu ########"
+        echo "####### ipex-llm-finetune-qlora-cpu-standalone ########"
        echo "##############################################################"
-        export image=intelanalytics/ipex-llm-finetune-qlora-cpu
+        export image=intelanalytics/ipex-llm-finetune-qlora-cpu-standalone
        cd docker/llm/finetune/qlora/cpu/docker
        sudo docker build \
          --no-cache=true \
@@ -102,6 +103,36 @@ jobs:
         sudo docker push ${image}:latest
         sudo docker rmi -f ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG} ${image}:latest
 
+  ipex-llm-finetune-qlora-cpu-k8s:
+    if: ${{ inputs.artifact == 'ipex-llm-finetune-qlora-cpu-k8s' || inputs.artifact == 'all' }}
+    runs-on: [self-hosted, Shire]
+
+    steps:
+    - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # actions/checkout@v3
+    - name: docker login
+      run: |
+        docker login -u ${DOCKERHUB_USERNAME} -p ${DOCKERHUB_PASSWORD}
+    - name: ipex-llm-finetune-qlora-cpu-k8s
+      run: |
+        echo "##############################################################"
+        echo "####### ipex-llm-finetune-qlora-cpu-k8s ########"
+        echo "##############################################################"
+        export image=intelanalytics/ipex-llm-finetune-qlora-cpu-k8s
+        cd docker/llm/finetune/qlora/cpu/docker
+        sudo docker build \
+          --no-cache=true \
+          --build-arg http_proxy=${HTTP_PROXY} \
+          --build-arg https_proxy=${HTTPS_PROXY} \
+          --build-arg no_proxy=${NO_PROXY} \
+          -t ${image}:${TAG} -f ./Dockerfile.k8s .
+        sudo docker push ${image}:${TAG}
+        sudo docker tag ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG}
+        sudo docker push 10.239.45.10/arda/${image}:${TAG}
+        # tag 'latest'
+        sudo docker tag ${image}:${TAG} ${image}:latest
+        sudo docker push ${image}:latest
+        sudo docker rmi -f ${image}:${TAG} 10.239.45.10/arda/${image}:${TAG} ${image}:latest
+
   ipex-llm-finetune-qlora-xpu:
     if: ${{ inputs.artifact == 'ipex-llm-finetune-qlora-xpu' || inputs.artifact == 'all' }}
     runs-on: [self-hosted, Shire]
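The new `ipex-llm-finetune-qlora-cpu-k8s` job repeats the same build, tag, push, and cleanup sequence as the other image jobs, differing only in the image name and in building from `Dockerfile.k8s`. For local reproduction, a rough docker-py sketch of that sequence -- the SDK calls are an illustrative stand-in for the `sudo docker` CLI commands the job actually runs, and the `TAG` fallback value is an assumption:

```python
# Illustrative docker-py version of the job's build -> tag -> push flow.
import os
import docker

client = docker.from_env()
name = "intelanalytics/ipex-llm-finetune-qlora-cpu-k8s"
tag = os.environ.get("TAG", "2.1.0-SNAPSHOT")  # assumed default for the sketch

# build with the same proxy build-args the workflow passes on the CLI
image, _ = client.images.build(
    path="docker/llm/finetune/qlora/cpu/docker",
    dockerfile="Dockerfile.k8s",
    tag=f"{name}:{tag}",
    nocache=True,
    buildargs={k: os.environ.get(k.upper(), "")
               for k in ("http_proxy", "https_proxy", "no_proxy")},
)

# push the versioned tag, the internal-registry tag, and 'latest'
for repo, t in ((name, tag), (f"10.239.45.10/arda/{name}", tag), (name, "latest")):
    image.tag(repo, t)
    client.images.push(repo, tag=t)
```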
diff --git a/.github/workflows/python-style-check.yml b/.github/workflows/python-style-check.yml
new file mode 100644
index 00000000000..24990aa6f35
--- /dev/null
+++ b/.github/workflows/python-style-check.yml
@@ -0,0 +1,54 @@
+name: Python Style Check
+
+# Cancel previous runs in the PR when you push new commits
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.run_id }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+
+# Controls when the action will run.
+on:
+  push:
+    branches: [ main ]
+    paths:
+      - 'python/**'
+      - '.github/workflows/python-style-check.yml'
+  pull_request:
+    branches: [ main ]
+    paths:
+      - 'python/**'
+      - '.github/workflows/python-style-check.yml'
+  # schedule:
+  #   - cron: '0 16 * * *'
+
+# A workflow run is made up of one or more jobs that can run sequentially or in parallel
+jobs:
+  # This workflow contains a single job called "build"
+  style-check:
+    # The type of runner that the job will run on
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: [3.7]
+
+    # Steps represent a sequence of tasks that will be executed as part of the job
+    steps:
+      # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
+      - uses: actions/checkout@ee0669bd1cc54295c223e0bb666b733df41de1c5 # actions/checkout@v2
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        run: |
+          pip install pycodestyle
+          pip install pydocstyle
+          pip install mypy==0.982
+          pip install wheel
+
+      - name: LLM style checking
+        run: bash python/llm/dev/test/lint-python
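This workflow gates changes under `python/**` on `python/llm/dev/test/lint-python`, which wraps pycodestyle, pydocstyle, and mypy. A small pre-push sketch that installs the same pinned tools and runs the same entry point locally -- only the commands come from the workflow; the wrapper itself is an assumption:

```python
# Run the style gate locally before pushing, mirroring the workflow's steps.
import subprocess
import sys

commands = [
    ["pip", "install", "pycodestyle", "pydocstyle", "mypy==0.982", "wheel"],
    ["bash", "python/llm/dev/test/lint-python"],
]
for cmd in commands:
    print("+", " ".join(cmd))        # echo each command before running it
    if subprocess.run(cmd).returncode != 0:
        sys.exit(1)                  # fail fast, like the CI job would
```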
diff --git a/docker/llm/finetune/qlora/cpu/docker/README.md b/docker/llm/finetune/qlora/cpu/docker/README.md
index 16e6e11da42..c50daa8ddf4 100644
--- a/docker/llm/finetune/qlora/cpu/docker/README.md
+++ b/docker/llm/finetune/qlora/cpu/docker/README.md
@@ -8,10 +8,10 @@ You can download directly from Dockerhub like:
 
 ```bash
 # For standalone
-docker pull intelanalytics/ipex-llm-finetune-qlora-cpu-standalone:2.5.0-SNAPSHOT
+docker pull intelanalytics/ipex-llm-finetune-qlora-cpu-standalone:2.1.0-SNAPSHOT
 
 # For k8s
-docker pull intelanalytics/ipex-llm-finetune-qlora-cpu-k8s:2.5.0-SNAPSHOT
+docker pull intelanalytics/ipex-llm-finetune-qlora-cpu-k8s:2.1.0-SNAPSHOT
 ```
 
 Or build the image from source:
@@ -24,7 +24,7 @@ export HTTPS_PROXY=your_https_proxy
 docker build \
   --build-arg http_proxy=${HTTP_PROXY} \
   --build-arg https_proxy=${HTTPS_PROXY} \
-  -t intelanalytics/ipex-llm-finetune-qlora-cpu-standalone:2.5.0-SNAPSHOT \
+  -t intelanalytics/ipex-llm-finetune-qlora-cpu-standalone:2.1.0-SNAPSHOT \
   -f ./Dockerfile .
 
 # For k8s
@@ -34,7 +34,7 @@ export HTTPS_PROXY=your_https_proxy
 docker build \
   --build-arg http_proxy=${HTTP_PROXY} \
   --build-arg https_proxy=${HTTPS_PROXY} \
-  -t intelanalytics/ipex-llm-finetune-qlora-cpu-k8s:2.5.0-SNAPSHOT \
+  -t intelanalytics/ipex-llm-finetune-qlora-cpu-k8s:2.1.0-SNAPSHOT \
   -f ./Dockerfile.k8s .
 ```
@@ -55,7 +55,7 @@ docker run -itd \
   -e https_proxy=${HTTPS_PROXY} \
   -v $BASE_MODE_PATH:/ipex_llm/model \
   -v $DATA_PATH:/ipex_llm/data/alpaca-cleaned \
-  intelanalytics/ipex-llm-finetune-qlora-cpu-standalone:2.5.0-SNAPSHOT
+  intelanalytics/ipex-llm-finetune-qlora-cpu-standalone:2.1.0-SNAPSHOT
 ```
 
 The download and mount of base model and data to a docker container demonstrates a standard fine-tuning process. You can skip this step for a quick start, and in this way, the fine-tuning codes will automatically download the needed files:
@@ -69,7 +69,7 @@ docker run -itd \
   --name=ipex-llm-fintune-qlora-cpu \
   -e http_proxy=${HTTP_PROXY} \
   -e https_proxy=${HTTPS_PROXY} \
-  intelanalytics/ipex-llm-finetune-qlora-cpu-standalone:2.5.0-SNAPSHOT
+  intelanalytics/ipex-llm-finetune-qlora-cpu-standalone:2.1.0-SNAPSHOT
 ```
 
 However, we do recommend you to handle them manually, because the automatical download can be blocked by Internet access and Huggingface authentication etc. according to different environment, and the manual method allows you to fine-tune in a custom way (with different base model and dataset).
@@ -130,7 +130,7 @@ docker run -itd \
   -e WORKER_COUNT_DOCKER=your_worker_count \
   -v your_downloaded_base_model_path:/ipex_llm/model \
   -v your_downloaded_data_path:/ipex_llm/data/alpaca_data_cleaned_archive.json \
-  intelanalytics/ipex-llm-finetune-qlora-cpu-standalone:2.5.0-SNAPSHOT
+  intelanalytics/ipex-llm-finetune-qlora-cpu-standalone:2.1.0-SNAPSHOT
 ```
 
 Note that `STANDALONE_DOCKER` is set to **TRUE** here.
diff --git a/docker/llm/finetune/qlora/cpu/kubernetes/values.yaml b/docker/llm/finetune/qlora/cpu/kubernetes/values.yaml
index 083f6584618..ccb85047960 100644
--- a/docker/llm/finetune/qlora/cpu/kubernetes/values.yaml
+++ b/docker/llm/finetune/qlora/cpu/kubernetes/values.yaml
@@ -1,4 +1,4 @@
-imageName: intelanalytics/ipex-llm-finetune-qlora-cpu-k8s:2.5.0-SNAPSHOT
+imageName: intelanalytics/ipex-llm-finetune-qlora-cpu-k8s:2.1.0-SNAPSHOT
 trainerNum: 2
 microBatchSize: 8
 enableGradientCheckpoint: false # true will save more memory but increase latency
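The README hunks above pin the image tag that `docker run` uses. For readers who drive Docker from Python instead of bash, a docker-py approximation of the README's standalone `docker run` command -- flag-for-flag parity is approximate, and the environment-variable names are taken from the README:

```python
# docker-py rendering of the README's standalone `docker run` invocation.
import os
import docker

client = docker.from_env()
container = client.containers.run(
    "intelanalytics/ipex-llm-finetune-qlora-cpu-standalone:2.1.0-SNAPSHOT",
    detach=True,                        # -d
    tty=True, stdin_open=True,          # -it
    network_mode="host",                # --net=host
    name="ipex-llm-fintune-qlora-cpu",  # container name as spelled in the README
    environment={"http_proxy": os.environ.get("HTTP_PROXY", ""),
                 "https_proxy": os.environ.get("HTTPS_PROXY", "")},
    volumes={os.environ["BASE_MODE_PATH"]: {"bind": "/ipex_llm/model", "mode": "rw"},
             os.environ["DATA_PATH"]: {"bind": "/ipex_llm/data/alpaca-cleaned", "mode": "rw"}},
)
print(container.short_id)
```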
diff --git a/python/llm/dev/test/lint-python b/python/llm/dev/test/lint-python
index 334e797b682..48bdd4806ee 100755
--- a/python/llm/dev/test/lint-python
+++ b/python/llm/dev/test/lint-python
@@ -21,7 +21,7 @@ SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )"
 PYTHON_ROOT_DIR="$SCRIPT_DIR/.."
 echo $PYTHON_ROOT_DIR
 PATHS_TO_CHECK="$SCRIPT_DIR/../../src"
-PATTERNS_TO_EXCLUDE="__init__.py,log4Error.py,$SCRIPT_DIR/../../src/bigdl/llm/langchain/*,$SCRIPT_DIR/../../src/bigdl/llm/transformers/gguf/models/model_implement/yuan2/*"
+PATTERNS_TO_EXCLUDE="__init__.py,log4Error.py,$SCRIPT_DIR/../../src/ipex_llm/langchain/*,$SCRIPT_DIR/../../src/ipex_llm/transformers/gguf/models/model_implement/yuan2/*"
 PEP8_REPORT_PATH="$PYTHON_ROOT_DIR/test/pep8-report.txt"
 PYLINT_REPORT_PATH="$PYTHON_ROOT_DIR/test/pylint-report.txt"
 PYLINT_INSTALL_INFO="$PYTHON_ROOT_DIR/test/pylint-info.txt"
diff --git a/python/llm/src/ipex_llm/transformers/low_bit_linear.py b/python/llm/src/ipex_llm/transformers/low_bit_linear.py
index 94a163690f0..7c6697f3eae 100644
--- a/python/llm/src/ipex_llm/transformers/low_bit_linear.py
+++ b/python/llm/src/ipex_llm/transformers/low_bit_linear.py
@@ -87,6 +87,8 @@
 #
 # Note this format cannot be used directly in IPEX-LLM's mm_int4, which expects
 # row major but packing two consecutive columns.
+
+
 def q4_0_xpu_transpose(ggml_weight, weight_shape):
     from ipex_llm.transformers.low_bit_linear import get_block_size
     Q4_0 = get_block_size("sym_int4")
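The comment context in this hunk contrasts GGML's Q4_0 nibble packing (quant `i` and quant `i+16` share a byte within a 32-element block) with the row-major, two-consecutive-elements-per-byte layout that `mm_int4` expects. A standalone numpy sketch of that reshuffle -- the function name and shapes are illustrative, not the repo's actual conversion code:

```python
# Illustrative nibble reshuffle between the two 4-bit packings the comment
# describes. Each Q4_0 block packs 32 quants into 16 bytes.
import numpy as np

def ggml_to_consecutive(packed: np.ndarray) -> np.ndarray:
    """packed: (n_blocks, 16) uint8 in GGML order -> same shape, pairing 2k with 2k+1."""
    lo, hi = packed & 0x0F, packed >> 4        # q[0:16] and q[16:32] of each block
    q = np.concatenate([lo, hi], axis=-1)      # logical order q[0..31]
    return (q[..., 0::2] | (q[..., 1::2] << 4)).astype(np.uint8)

blocks = np.arange(32, dtype=np.uint8).reshape(2, 16)  # two toy blocks
print(ggml_to_consecutive(blocks).shape)               # (2, 16)
```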
diff --git a/python/llm/src/ipex_llm/transformers/model.py b/python/llm/src/ipex_llm/transformers/model.py
index c4410d4ce0e..be3f13a5738 100644
--- a/python/llm/src/ipex_llm/transformers/model.py
+++ b/python/llm/src/ipex_llm/transformers/model.py
@@ -312,7 +312,7 @@ def from_pretrained(cls,
         cpu_embedding = kwargs.get("cpu_embedding", False)
         # for 2bit, default use embedding_quantization
         if q_k in ["gguf_iq2_xxs", "gguf_iq2_xs", "gguf_iq1_s", "q2_k"] and \
-           not cpu_embedding and embedding_qtype is None:
+                not cpu_embedding and embedding_qtype is None:
             embedding_qtype = "q2_k"
         if imatrix_file is not None:
             imatrix_data = load_imatrix_data(imatrix_file)
diff --git a/python/llm/src/ipex_llm/transformers/models/chatglm2.py b/python/llm/src/ipex_llm/transformers/models/chatglm2.py
index 31a8624592c..1c0c670a8a9 100644
--- a/python/llm/src/ipex_llm/transformers/models/chatglm2.py
+++ b/python/llm/src/ipex_llm/transformers/models/chatglm2.py
@@ -227,8 +227,8 @@ def chatglm2_quantized_attention_forward_8eb45c(
         key_layer = key_layer.transpose(0, 1)
         query_layer_cur = query_layer[..., :rot_dim]
         key_layer_cur = key_layer[..., :rot_dim]
-        # ipex_llm's apply_rotary_embedding can change the origin storage, so query_layer will get
-        # the result directly.
+        # ipex_llm's apply_rotary_embedding can change the origin storage,
+        # so query_layer will get the result directly.
         torch.ops.torch_ipex.apply_rotary_embedding(query_layer_cur, sin, cos, query_layer_cur)
         torch.ops.torch_ipex.apply_rotary_embedding(key_layer_cur, sin, cos, key_layer_cur)
         query_layer = query_layer.transpose(0, 1)
@@ -367,8 +367,8 @@ def chatglm2_attention_forward_8eb45c(
         key_layer = key_layer.transpose(0, 1)
         query_layer_cur = query_layer[..., :rot_dim]
         key_layer_cur = key_layer[..., :rot_dim]
-        # ipex_llm's apply_rotary_embedding can change the origin storage, so query_layer will get
-        # the result directly.
+        # ipex_llm's apply_rotary_embedding can change the origin storage,
+        # so query_layer will get the result directly.
         torch.ops.torch_ipex.apply_rotary_embedding(query_layer_cur, sin, cos, query_layer_cur)
         torch.ops.torch_ipex.apply_rotary_embedding(key_layer_cur, sin, cos, key_layer_cur)
         query_layer = query_layer.transpose(0, 1)
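The rewrapped chatglm2 comment is easy to misread, so a minimal PyTorch illustration of the point it makes: `query_layer_cur` is a view of `query_layer`, and an in-place op on a view writes straight into the base tensor's storage. Plain `mul_` stands in here for the fused `apply_rotary_embedding` kernel:

```python
# In-place ops on a view mutate the base tensor's storage, which is why
# query_layer "gets the result directly" after rotating query_layer_cur.
import torch

query_layer = torch.ones(2, 8)
query_layer_cur = query_layer[..., :4]  # view over the first rot_dim columns
query_layer_cur.mul_(2.0)               # in-place update through the view
print(query_layer[0])                   # tensor([2., 2., 2., 2., 1., 1., 1., 1.])
```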
diff --git a/python/llm/src/ipex_llm/transformers/models/llama.py b/python/llm/src/ipex_llm/transformers/models/llama.py
index 862e09b1663..25330fe008a 100644
--- a/python/llm/src/ipex_llm/transformers/models/llama.py
+++ b/python/llm/src/ipex_llm/transformers/models/llama.py
@@ -64,7 +64,7 @@
 logger = logging.get_logger(__name__)
 
 
-def llama_decoding_fast_path_qtype_check(proj): 
+def llama_decoding_fast_path_qtype_check(proj):
     # IQ2_XXS only can be used in Llama-like model
     qtype = getattr(proj, "qtype", None)
     return qtype in [SYM_INT4, FP8E5, IQ2_XXS, FP4]
diff --git a/python/llm/src/ipex_llm/transformers/models/qwen.py b/python/llm/src/ipex_llm/transformers/models/qwen.py
index 85ac72ceada..09709136499 100644
--- a/python/llm/src/ipex_llm/transformers/models/qwen.py
+++ b/python/llm/src/ipex_llm/transformers/models/qwen.py
@@ -136,7 +136,7 @@ def qwen_attention_forward_original(
     device = hidden_states.device
     # for flash attention
     original_dtype = hidden_states.dtype
-    position_ids = rotary_pos_emb_list[-1] # the last one is posisiton_ids
+    position_ids = rotary_pos_emb_list[-1]  # the last one is position_ids
     rotary_pos_emb_list = rotary_pos_emb_list[:-1]
 
     use_fuse_rope = should_use_fuse_rope(self, hidden_states)
@@ -332,7 +332,7 @@ def qwen_attention_forward_quantized(
     bsz, q_len, _ = hidden_states.size()
     device = hidden_states.device
 
-    position_ids = rotary_pos_emb_list[-1] # the last one is posisiton_ids
+    position_ids = rotary_pos_emb_list[-1]  # the last one is position_ids
     rotary_pos_emb_list = rotary_pos_emb_list[:-1]
 
     use_fuse_rope = should_use_fuse_rope(self, hidden_states)
diff --git a/python/llm/src/ipex_llm/transformers/models/utils.py b/python/llm/src/ipex_llm/transformers/models/utils.py
index a3c66a0dd5a..2241a9bc434 100644
--- a/python/llm/src/ipex_llm/transformers/models/utils.py
+++ b/python/llm/src/ipex_llm/transformers/models/utils.py
@@ -29,7 +29,7 @@
 GELU = 1
 
 
-def decoding_fast_path_qtype_check(proj): 
+def decoding_fast_path_qtype_check(proj):
     qtype = getattr(proj, "qtype", None)
     return qtype in [SYM_INT4, FP8E5, FP4]
 
@@ -91,7 +91,7 @@ def use_quantize_kv_cache(linear: torch.nn.Module, x: torch.Tensor) -> bool:
 
 def kv_cache_device_check(x: torch.Tensor) -> bool:
     return get_xpu_device_type(x) == "mtl" or \
-        ((get_xpu_device_type(x) == "arc" or get_xpu_device_type(x) == "flex") and \
+        ((get_xpu_device_type(x) == "arc" or get_xpu_device_type(x) == "flex") and
          1 < x.size(0) and x.size(0) <= 8)
 
@@ -330,8 +330,8 @@ def use_esimd_sdp(q_len, k_len, head_dim, query_states, attention_mask=None):
         device_name = torch.xpu.get_device_name(query_states.device.index)
         if device_name.startswith("Intel(R) Arc(TM) A") or \
-           device_name.startswith("Intel(R) Data Center GPU Flex") or \
-           device_name.startswith("Intel(R) Data Center GPU Max"):
+                device_name.startswith("Intel(R) Data Center GPU Flex") or \
+                device_name.startswith("Intel(R) Data Center GPU Max"):
             import linear_fp16_esimd
             if not hasattr(linear_fp16_esimd, "sdp_forward"):
                 return False
diff --git a/python/llm/src/ipex_llm/vllm/core/scheduler.py b/python/llm/src/ipex_llm/vllm/core/scheduler.py
index 0667d4e9480..3ebb0a8876a 100644
--- a/python/llm/src/ipex_llm/vllm/core/scheduler.py
+++ b/python/llm/src/ipex_llm/vllm/core/scheduler.py
@@ -43,7 +43,7 @@
 from ipex_llm.vllm.logger import init_logger
 from ipex_llm.vllm.sequence import SequenceData, SequenceStatus
 from ipex_llm.vllm.sequence import (Sequence, SequenceGroup,
-                                   SequenceGroupMetadata)
+                                    SequenceGroupMetadata)
 from ipex_llm.utils.common import invalidInputError
 
 logger = init_logger(__name__)
diff --git a/python/llm/src/ipex_llm/vllm/model_executor/layers/bigdl_sampler.py b/python/llm/src/ipex_llm/vllm/model_executor/layers/bigdl_sampler.py
index a3675ae6d86..e838d16543d 100644
--- a/python/llm/src/ipex_llm/vllm/model_executor/layers/bigdl_sampler.py
+++ b/python/llm/src/ipex_llm/vllm/model_executor/layers/bigdl_sampler.py
@@ -43,7 +43,7 @@
 from ipex_llm.vllm.sampling_params import SamplingParams, SamplingType
 from ipex_llm.vllm.sequence import (SamplerOutput, SequenceGroupMetadata,
-                                   SequenceData, SequenceOutputs)
+                                    SequenceData, SequenceOutputs)
 import time
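The remaining hunks are pycodestyle continuation-line fixes: inside parentheses a trailing backslash is redundant (E502), and a parenthesized import list should align with the opening parenthesis (E128). A small runnable sketch of both conventions, with illustrative names rather than the repo's:

```python
# E128: a hanging continuation lines up under the opening '(' of the import.
from os.path import (join, split,
                     splitext)  # aligned like the fixed vllm imports

def batch_size_ok(n: int) -> bool:
    # E502: brackets give implicit continuation, so no trailing backslash is
    # needed -- same shape as the kv_cache_device_check fix above.
    return (1 < n and
            n <= 8)

print(batch_size_ok(4), join(*split(splitext("a/b.txt")[0])))  # True a/b
```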