Merge remote-tracking branch 'upstream/main' into main
DTennant committed Apr 29, 2024
2 parents da4dbe2 + 7a22b7d commit 3a091cd
Showing 20 changed files with 511 additions and 34 deletions.
61 changes: 61 additions & 0 deletions .github/workflows/build_docker_images.yml
@@ -46,6 +46,7 @@ jobs:
tags: huggingface/peft-cpu

- name: Post to a Slack channel
if: always()
id: slack
#uses: slackapi/[email protected]
uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
@@ -102,6 +103,7 @@ jobs:
tags: huggingface/peft-gpu

- name: Post to a Slack channel
if: always()
id: slack
#uses: slackapi/[email protected]
uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
@@ -159,6 +161,7 @@ jobs:


- name: Post to a Slack channel
if: always()
id: slack
#uses: slackapi/[email protected]
uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
@@ -216,6 +219,64 @@ jobs:
tags: huggingface/peft-gpu-bnb-latest

- name: Post to a Slack channel
if: always()
id: slack
#uses: slackapi/[email protected]
uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
with:
# Slack channel id, channel name, or user id to post message.
# See also: https://api.slack.com/methods/chat.postMessage#channels
channel-id: ${{ env.CI_SLACK_CHANNEL }}
# For posting a rich message using Block Kit
payload: |
{
"text": "peft-gpu + bnb-source (latest) Docker Image build result: ${{ job.status }}\n${{ github.event.pull_request.html_url || github.event.head_commit.url }}",
"blocks": [
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": "peft-gpu + bnb-source (latest) Docker Image build result: ${{ job.status }}\n${{ github.event.pull_request.html_url || github.event.head_commit.url }}"
}
}
]
}
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

latest-cuda-bnb-source-multi:
name: "Latest Peft GPU + bnb (multi-backend) source [accelerate / peft / transformers source]"
runs-on: ubuntu-latest
steps:
- name: Cleanup disk
run: |
sudo ls -l /usr/local/lib/
sudo ls -l /usr/share/
sudo du -sh /usr/local/lib/
sudo du -sh /usr/share/
sudo rm -rf /usr/local/lib/android
sudo rm -rf /usr/share/dotnet
sudo du -sh /usr/local/lib/
sudo du -sh /usr/share/
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
- name: Check out code
uses: actions/checkout@v3
- name: Login to DockerHub
uses: docker/login-action@v1
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}

- name: Build and Push GPU
uses: docker/build-push-action@v4
with:
context: ./docker/peft-gpu-bnb-multi-source
push: true
tags: huggingface/peft-gpu-bnb-multi-source

- name: Post to a Slack channel
if: always()
id: slack
#uses: slackapi/[email protected]
uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
4 changes: 2 additions & 2 deletions .github/workflows/nightly-bnb.yml
@@ -18,7 +18,7 @@ jobs:
strategy:
fail-fast: false
matrix:
docker-image-name: ["huggingface/peft-gpu-bnb-source:latest", "huggingface/peft-gpu-bnb-latest:latest"]
docker-image-name: ["huggingface/peft-gpu-bnb-source:latest", "huggingface/peft-gpu-bnb-latest:latest", "huggingface/peft-gpu-bnb-multi-source:latest"]
runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, ci]
env:
CUDA_VISIBLE_DEVICES: "0"
@@ -74,7 +74,7 @@ jobs:
strategy:
fail-fast: false
matrix:
docker-image-name: ["huggingface/peft-gpu-bnb-source:latest", "huggingface/peft-gpu-bnb-latest:latest"]
docker-image-name: ["huggingface/peft-gpu-bnb-source:latest", "huggingface/peft-gpu-bnb-latest:latest", "huggingface/peft-gpu-bnb-multi-source:latest"]
runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci]
env:
CUDA_VISIBLE_DEVICES: "0,1"
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
@@ -33,7 +33,7 @@ jobs:
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11"]
os: ["ubuntu-latest", "macos-latest", "windows-latest"]
os: ["ubuntu-latest", "macos-12", "windows-latest"]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v3
11 changes: 11 additions & 0 deletions docker/README.md
@@ -0,0 +1,11 @@
# PEFT Docker images

Here we store all PEFT Docker images used in our testing infrastructure. We use Python 3.8 for now on all our images.

- `peft-cpu`: PEFT compiled on CPU with all other HF libraries installed from the main branch
- `peft-gpu`: PEFT compiled for NVIDIA GPUs with all other HF libraries installed from the main branch
- `peft-gpu-bnb-source`: PEFT compiled for NVIDIA GPUs with `bitsandbytes` and all other HF libraries installed from the main branch
- `peft-gpu-bnb-latest`: PEFT compiled for NVIDIA GPUs with `bitsandbytes` compiled from main and all other HF libraries installed from the latest PyPI
- `peft-gpu-bnb-multi-source`: PEFT compiled for NVIDIA GPUs with `bitsandbytes` compiled from the `multi-backend-refactor` branch and all other HF libraries installed from the main branch

`peft-gpu-bnb-source` and `peft-gpu-bnb-multi-source` are essentially the same, the only difference being that `bitsandbytes` is compiled from a different branch. Make sure to propagate any changes you apply to one file to the other!
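
A quick way to check, inside any of these images, that the HF libraries were indeed installed from their main branches (a minimal sketch, not part of the build — installs from a main branch typically carry a `.dev0`-style version suffix):

```py
import accelerate
import peft
import transformers

# Installs from the main branch usually report a ".dev0" development version.
for lib in (peft, transformers, accelerate):
    print(lib.__name__, lib.__version__)
```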
68 changes: 68 additions & 0 deletions docker/peft-gpu-bnb-multi-source/Dockerfile
@@ -0,0 +1,68 @@
# Builds GPU docker image of PyTorch
# Uses multi-staged approach to reduce size
# Stage 1
# Use base conda image to reduce time
FROM continuumio/miniconda3:latest AS compile-image
# Specify py version
ENV PYTHON_VERSION=3.8
# Install apt libs - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
RUN apt-get update && \
apt-get install -y curl git wget software-properties-common git-lfs && \
apt-get clean && \
rm -rf /var/lib/apt/lists*

# Install audio-related libraries
RUN apt-get update && \
apt install -y ffmpeg

RUN apt install -y libsndfile1-dev
RUN git lfs install

# Create our conda env - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
RUN conda create --name peft python=${PYTHON_VERSION} ipython jupyter pip
RUN python3 -m pip install --no-cache-dir --upgrade pip

# Below is copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
# We don't install pytorch here yet since CUDA isn't available
# instead we use the direct torch wheel
ENV PATH /opt/conda/envs/peft/bin:$PATH
# Activate our bash shell
RUN chsh -s /bin/bash
SHELL ["/bin/bash", "-c"]

# Stage 2
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS build-image
COPY --from=compile-image /opt/conda /opt/conda
ENV PATH /opt/conda/bin:$PATH

RUN chsh -s /bin/bash
SHELL ["/bin/bash", "-c"]

# Install apt libs
RUN apt-get update && \
apt-get install -y curl git wget cmake && \
apt-get clean && \
rm -rf /var/lib/apt/lists*

# Activate the conda env and install transformers + accelerate from source
# Also clone BNB and build it from source.
RUN source activate peft && \
python3 -m pip install -U --no-cache-dir \
librosa \
"soundfile>=0.12.1" \
scipy \
git+https://github.com/huggingface/transformers \
git+https://github.com/huggingface/accelerate \
peft[test]@git+https://github.com/huggingface/peft \
optimum \
auto-gptq && \
git clone https://github.com/TimDettmers/bitsandbytes && cd bitsandbytes && git checkout multi-backend-refactor && \
cmake -B . -DCOMPUTE_BACKEND=cuda -S . && \
cmake --build . && \
pip install -e . && \
pip freeze | grep bitsandbytes

RUN echo "source activate peft" >> ~/.profile

# Activate the virtualenv
CMD ["/bin/bash"]
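
Once the image is built, a quick interpreter check (a sketch, not part of the Dockerfile) can confirm that the `peft` env picked up the `bitsandbytes` build from the `multi-backend-refactor` branch and whether CUDA is visible:

```py
import bitsandbytes as bnb
import torch

print(bnb.__version__)            # should report the source build installed above
print(torch.cuda.is_available())  # True on a GPU runner, False on the CPU build host
```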
8 changes: 6 additions & 2 deletions docker/peft-gpu/Dockerfile
@@ -42,16 +42,20 @@ RUN source activate peft && \

# Add autoawq for quantization testing
RUN source activate peft && \
python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.1/autoawq-0.2.1-cp38-cp38-linux_x86_64.whl
python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.4/autoawq-0.2.4-cp38-cp38-linux_x86_64.whl
RUN source activate peft && \
python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ_kernels/releases/download/v0.0.4/autoawq_kernels-0.0.4-cp38-cp38-linux_x86_64.whl
python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ_kernels/releases/download/v0.0.6/autoawq_kernels-0.0.6-cp38-cp38-linux_x86_64.whl

# Install apt libs
RUN apt-get update && \
apt-get install -y curl git wget && \
apt-get clean && \
rm -rf /var/lib/apt/lists*

# Add eetq for quantization testing
RUN source activate peft && \
python3 -m pip install git+https://github.com/NetEase-FuXi/EETQ.git

# Activate the conda env and install transformers + accelerate from source
RUN source activate peft && \
python3 -m pip install -U --no-cache-dir \
2 changes: 0 additions & 2 deletions docs/source/accelerate/deepspeed.md
@@ -22,8 +22,6 @@ For DeepSpeed Stage 3 + QLoRA, please refer to the section [Use PEFT QLoRA and D

To confirm these observations, we ran the SFT (Supervised Fine-tuning) [official example scripts](https://github.com/huggingface/trl/tree/main/examples) of the [Transformers Reinforcement Learning (TRL) library](https://github.com/huggingface/trl) using QLoRA + PEFT and the accelerate configs available [here](https://github.com/huggingface/trl/tree/main/examples/accelerate_configs). We ran these experiments on 2x NVIDIA T4 GPUs.

Note DeepSpeed-Zero3 and `bitsandbytes` are currently **not** compatible.

# Use PEFT and DeepSpeed with ZeRO3 for finetuning large models on multiple devices and multiple nodes

This section of the guide will help you learn how to use our DeepSpeed [training script](https://github.com/huggingface/peft/blob/main/examples/sft/train.py) for performing SFT. You'll configure the script to do SFT (supervised fine-tuning) of the Llama-70B model with LoRA and ZeRO-3 on 8x H100 80GB GPUs on a single machine. You can configure it to scale to multiple machines by changing the accelerate config.
36 changes: 36 additions & 0 deletions docs/source/developer_guides/quantization.md
@@ -128,6 +128,42 @@ quantized_model = get_peft_model(quantized_model, peft_config)

You can refer to the [Google Colab](https://colab.research.google.com/drive/12GTp1FCj5_0SnnNQH18h_2XFh9vS_guX?usp=sharing) example for an overview of AQLM+LoRA finetuning.

## EETQ quantization

You can also perform LoRA fine-tuning on EETQ-quantized models. The [EETQ](https://github.com/NetEase-FuXi/EETQ) package offers a simple and efficient way to perform 8-bit quantization, which is claimed to be faster than the `LLM.int8()` algorithm. First, make sure that you have a transformers version that is compatible with EETQ (e.g. by installing it from the latest PyPI or from source).

```py
import torch
from transformers import EetqConfig

config = EetqConfig("int8")
```

Pass the `config` to the [`~transformers.AutoModelForCausalLM.from_pretrained`] method.

```py
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", quantization_config=config)
```

Then create a `LoraConfig` and pass it to `get_peft_model`:

```py
from peft import LoraConfig, get_peft_model

config = LoraConfig(
r=16,
lora_alpha=8,
target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
lora_dropout=0.05,
bias="none",
task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
```
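
As a quick sanity check after wrapping the model, you can confirm that only the LoRA adapter weights are trainable and save the adapter as usual. A minimal sketch, assuming the `model` from the snippets above (the output directory name is illustrative):

```py
# Only the LoRA adapter weights should be trainable on top of the
# frozen EETQ-quantized base model.
model.print_trainable_parameters()

# Saving works like for any other PEFT model; only the adapter weights are written.
model.save_pretrained("mistral-7b-v0.1-eetq-lora")
```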

## Next steps

If you're interested in learning more about quantization, the following may be helpful:
5 changes: 5 additions & 0 deletions src/peft/import_utils.py
@@ -77,3 +77,8 @@ def is_aqlm_available():
@lru_cache
def is_auto_awq_available():
return importlib.util.find_spec("awq") is not None


@lru_cache
def is_eetq_available():
return importlib.util.find_spec("eetq") is not None
21 changes: 12 additions & 9 deletions src/peft/tuners/ia3/layer.py
@@ -111,6 +111,7 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = N
if active_adapter in self.ia3_l.keys():
base_layer = self.get_base_layer()
ia3_l = transpose(self.ia3_l[active_adapter].data, self.fan_in_fan_out)
orig_dtype = base_layer.weight.data.dtype
if safe_merge:
orig_weights = base_layer.weight.data
orig_weights = torch.mul(orig_weights, ia3_l)
@@ -119,13 +120,14 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = N
raise ValueError(
f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
)
base_layer.weight.data = orig_weights
base_layer.weight.data = orig_weights.to(orig_dtype)
else:
base_layer.weight.data = torch.mul(base_layer.weight.data, ia3_l)
base_layer.weight.data = torch.mul(base_layer.weight.data, ia3_l).to(orig_dtype)

if not self.is_feedforward and (base_layer.bias is not None):
scaling = self.ia3_l[active_adapter].reshape(base_layer.bias.shape)
base_layer.bias.data = torch.mul(base_layer.bias.data, scaling.data)
orig_dtype = base_layer.bias.data.dtype
base_layer.bias.data = torch.mul(base_layer.bias.data, scaling.data).to(orig_dtype)

self.merged_adapters.append(active_adapter)

@@ -144,15 +146,16 @@ def unmerge(self) -> None:
base_layer = self.get_base_layer()
# Add tolerance to avoid division by zero
ia3_l = transpose(self.ia3_l[active_adapter].data, self.fan_in_fan_out) + 1e-8
base_layer.weight.data = torch.div(base_layer.weight.data, ia3_l)
orig_dtype = base_layer.weight.data.dtype
base_layer.weight.data = torch.div(base_layer.weight.data, ia3_l).to(orig_dtype)

if not self.is_feedforward and (base_layer.bias is not None):
scaling = self.ia3_l[active_adapter].reshape(base_layer.bias.shape)
base_layer.bias.data = torch.div(base_layer.bias.data, scaling.data + 1e-8)
orig_dtype = base_layer.bias.data.dtype
base_layer.bias.data = torch.div(base_layer.bias.data, scaling.data + 1e-8).to(orig_dtype)

def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:
dtype = previous_dtype = x.dtype

if self.disable_adapters:
if self.merged:
self.unmerge()
@@ -171,13 +174,13 @@ def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:
x = x.to(dtype)
# TODO: weight.dtype can be != self.ia3_l[self.active_adapters].dtype
# e.g. bf16 vs fp32. Is that okay?
interm = (x * ia3_scaling).to(self.get_base_layer().weight.dtype)
interm = (x * ia3_scaling).to(previous_dtype)
result = self.base_layer(interm, *args, **kwargs)
else:
result = self.base_layer(x, *args, **kwargs)
result = result.to(dtype) * ia3_scaling
result_dtype = result.dtype
result = (result * ia3_scaling).to(result_dtype)

result = result.to(previous_dtype)
return result


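For context on the `orig_dtype` casts added above: an elementwise multiplication between a half-precision base weight and an fp32 IA³ scaling vector follows PyTorch type promotion and silently yields an fp32 result, so merging would change the base layer's dtype. A minimal sketch of the effect (illustrative shapes, not the PEFT implementation):

```py
import torch

weight = torch.randn(4, 4, dtype=torch.bfloat16)  # base layer weight
ia3_l = torch.rand(4, dtype=torch.float32)         # IA3 scaling vector

# Plain multiplication promotes the result to fp32.
merged = torch.mul(weight, ia3_l)
print(merged.dtype)  # torch.float32

# Casting back, as the patch does, keeps the base layer in its original dtype.
orig_dtype = weight.dtype
merged = torch.mul(weight, ia3_l).to(orig_dtype)
print(merged.dtype)  # torch.bfloat16
```
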
7 changes: 6 additions & 1 deletion src/peft/tuners/lora/__init__.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from peft.import_utils import is_bnb_4bit_available, is_bnb_available
from peft.import_utils import is_bnb_4bit_available, is_bnb_available, is_eetq_available

from .config import LoftQConfig, LoraConfig
from .gptq import QuantLinear
@@ -34,4 +34,9 @@ def __getattr__(name):

return Linear4bit

if (name == "EetqLoraLinear") and is_eetq_available():
from .eetq import EetqLoraLinear

return EetqLoraLinear

raise AttributeError(f"module {__name__} has no attribute {name}")
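
The `__getattr__` hook above uses PEP 562 module-level lazy attribute access: `EetqLoraLinear` is only imported when it is first requested and the optional `eetq` package is available. A hedged sketch of how a caller might consume it:

```py
from peft.import_utils import is_eetq_available

if is_eetq_available():
    # Resolved through the module-level __getattr__ above, so the optional
    # eetq dependency is only imported when it is actually installed.
    from peft.tuners.lora import EetqLoraLinear
```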