Adds VeRA (Vector-based Random Matrix Adaptation) #2 #1564

Merged 29 commits on Apr 19, 2024
Changes from 21 commits
Commits (29):
c324ab2
[WIP] Initial commit
BenjaminBossan Mar 14, 2024
d8fe06b
Make style
BenjaminBossan Mar 14, 2024
463cd8e
Make style 2
BenjaminBossan Mar 14, 2024
20bf5d9
Feat: Support for Conv2D DoRA (#1516)
sayakpaul Mar 12, 2024
a344708
TST Report slowest tests (#1556)
BenjaminBossan Mar 12, 2024
2cc70ca
Changes to support fsdp+qlora and dsz3+qlora (#1550)
pacman100 Mar 13, 2024
1c4a765
Merge branch 'main' into add-vera-2
BenjaminBossan Mar 15, 2024
e7fabe3
More make style
BenjaminBossan Mar 15, 2024
0abb38d
Some further work, still WIP
BenjaminBossan Mar 18, 2024
2778657
More tests and fixes for VeRA
BenjaminBossan Mar 20, 2024
5898220
More tests, remove support for Embedding
BenjaminBossan Mar 20, 2024
a829035
Add checks for require_grad
BenjaminBossan Mar 20, 2024
9b80b1b
Some minor fixes, don't raise unnecessary errors
BenjaminBossan Mar 20, 2024
2ccb286
Fix issue caused by order of init for VeRA
BenjaminBossan Mar 20, 2024
f4dd9a3
Merge branch 'main' into add-vera-2
BenjaminBossan Mar 20, 2024
2d9687b
projection_prng_key now defaults to 0
BenjaminBossan Mar 21, 2024
209abd2
Skip failing Deberta + Vera tests
BenjaminBossan Mar 21, 2024
f0a319d
Add a sanity check to data_ptr test
BenjaminBossan Mar 21, 2024
30755a9
More sanity checks for data_ptr
BenjaminBossan Mar 21, 2024
b707ea8
Add VeRA example notebook
BenjaminBossan Mar 21, 2024
1f86941
Add some docs
BenjaminBossan Mar 21, 2024
4cc496f
Address reviewer feedback
BenjaminBossan Apr 8, 2024
ee86485
Merge branch 'main' into add-vera-2
BenjaminBossan Apr 11, 2024
924c235
Merge branch 'main' into add-vera-2
BenjaminBossan Apr 15, 2024
4739ef9
Make style
BenjaminBossan Apr 15, 2024
eefcc4f
Reviewer feedback: Adjust docstring
BenjaminBossan Apr 18, 2024
a0dab53
Update supported models for VeRA
BenjaminBossan Apr 18, 2024
5979b7b
Merge branch 'main' into add-vera-2
BenjaminBossan Apr 18, 2024
fec23e7
Fix adapter name handling
BenjaminBossan Apr 18, 2024
2 changes: 2 additions & 0 deletions docs/source/_toctree.yml
@@ -98,6 +98,8 @@
title: Prefix tuning
- local: package_reference/prompt_tuning
title: Prompt tuning
- local: package_reference/vera
title: VeRA
title: Adapters
- sections:
- local: package_reference/merge_utils
41 changes: 41 additions & 0 deletions docs/source/package_reference/vera.md
@@ -0,0 +1,41 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

# VeRA: Vector-based Random Matrix Adaptation

[VeRA](https://huggingface.co/papers/2310.11454) is a parameter-efficient fine-tuning technique that is similar to LoRA but requires even fewer extra parameters while promising similar or even better performance. As such, it is particularly useful when the parameter budget is very limited, e.g. when scaling to very large models. The reduction in trainable parameters is achieved by sharing the same low-rank matrices across all layers and training only two additional vectors per layer.

When saving the adapter parameters, it's possible to eschew storing the low-rank matrices by setting `save_projection=False` on the `VeraConfig`. In that case, these matrices will be restored based on the fixed random seed from the `projection_prng_key` argument. This cuts down on the size of the checkpoint, but we cannot guarantee reproducibility on all devices and for all future versions of PyTorch. If you want to ensure reproducibility, set `save_projection=True` (which is the default).

VeRA currently has the following constraints:

- All targeted parameters must have the same shape.
- Only `nn.Linear` layers are supported.
- Quantized layers are not supported.

If these constraints don't work for your use case, use LoRA instead.
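
To make the configuration options above concrete, here is a minimal usage sketch; the base model, target modules, and rank are illustrative assumptions rather than settings prescribed by this PR.

```python
# A minimal sketch of applying VeRA via PEFT. The base model, target module
# names, and rank are illustrative assumptions, not recommendations from this PR.
from transformers import AutoModelForSequenceClassification
from peft import VeraConfig, get_peft_model

base_model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

config = VeraConfig(
    r=256,                              # rank of the shared random projections
    target_modules=["query", "value"],  # targeted layers must all share the same shape
    projection_prng_key=0,              # seed used to (re)create the frozen random matrices
    save_projection=True,               # store the matrices in the checkpoint (default)
)

model = get_peft_model(base_model, config)
model.print_trainable_parameters()
```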

The abstract from the paper is:

> Low-rank adaptation (LoRA) is a popular method that reduces the number of trainable parameters when finetuning large language models, but still faces acute storage challenges when scaling to even larger models or deploying numerous per-user or per-task adapted models. In this work, we present Vector-based Random Matrix Adaptation (VeRA), which significantly reduces the number of trainable parameters compared to LoRA, yet maintains the same performance. It achieves this by using a single pair of low-rank matrices shared across all layers and learning small scaling vectors instead. We demonstrate its effectiveness on the GLUE and E2E benchmarks, image classification tasks, and show its application in instruction-tuning of 7B and 13B language models.

## VeRAConfig

[[autodoc]] tuners.vera.config.VeraConfig

## VeRAModel

[[autodoc]] tuners.vera.model.VeraModel
543 changes: 543 additions & 0 deletions examples/sequence_classification/VeRA.ipynb

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions src/peft/__init__.py
@@ -73,6 +73,8 @@
OFTModel,
PolyConfig,
PolyModel,
VeraConfig,
VeraModel,
)
from .utils import (
TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING,
11 changes: 8 additions & 3 deletions src/peft/mapping.py
@@ -49,15 +49,18 @@
PrefixTuningConfig,
PromptEncoderConfig,
PromptTuningConfig,
VeraConfig,
VeraModel,
)
from .tuners.tuners_utils import BaseTuner as _BaseTuner
from .utils import _prepare_prompt_learning_config


if TYPE_CHECKING:
from transformers import PreTrainedModel


MODEL_TYPE_TO_PEFT_MODEL_MAPPING: dict[str, PeftModel] = {
MODEL_TYPE_TO_PEFT_MODEL_MAPPING: dict[str, type[PeftModel]] = {
"SEQ_CLS": PeftModelForSequenceClassification,
"SEQ_2_SEQ_LM": PeftModelForSeq2SeqLM,
"CAUSAL_LM": PeftModelForCausalLM,
@@ -66,7 +69,7 @@
"FEATURE_EXTRACTION": PeftModelForFeatureExtraction,
}

PEFT_TYPE_TO_CONFIG_MAPPING: dict[str, PeftConfig] = {
PEFT_TYPE_TO_CONFIG_MAPPING: dict[str, type[PeftConfig]] = {
"ADAPTION_PROMPT": AdaptionPromptConfig,
"PROMPT_TUNING": PromptTuningConfig,
"PREFIX_TUNING": PrefixTuningConfig,
@@ -79,16 +82,18 @@
"MULTITASK_PROMPT_TUNING": MultitaskPromptTuningConfig,
"OFT": OFTConfig,
"POLY": PolyConfig,
"VERA": VeraConfig,
}

PEFT_TYPE_TO_TUNER_MAPPING = {
PEFT_TYPE_TO_TUNER_MAPPING: dict[str, type[_BaseTuner]] = {
"LORA": LoraModel,
"LOHA": LoHaModel,
"LOKR": LoKrModel,
"ADALORA": AdaLoraModel,
"IA3": IA3Model,
"OFT": OFTModel,
"POLY": PolyModel,
"VERA": VeraModel,
}


2 changes: 2 additions & 0 deletions src/peft/peft_model.py
@@ -50,6 +50,7 @@
PrefixEncoder,
PromptEmbedding,
PromptEncoder,
VeraModel,
)
from .utils import (
SAFETENSORS_WEIGHTS_NAME,
@@ -82,6 +83,7 @@
PeftType.IA3: IA3Model,
PeftType.OFT: OFTModel,
PeftType.POLY: PolyModel,
PeftType.VERA: VeraModel,
}


1 change: 1 addition & 0 deletions src/peft/tuners/__init__.py
@@ -30,3 +30,4 @@
from .oft import OFTConfig, OFTModel
from .mixed import MixedModel
from .poly import PolyConfig, PolyModel
from .vera import VeraConfig, VeraModel
20 changes: 18 additions & 2 deletions src/peft/tuners/tuners_utils.py
@@ -145,6 +145,7 @@ def __init__(self, model, peft_config: Union[PeftConfig, dict[str, PeftConfig]],
self.peft_config.update(peft_config)

self.active_adapter = adapter_name
self._pre_injection_hook(self.model, self.peft_config[adapter_name], adapter_name)
self.inject_adapter(self.model, adapter_name)

# Copy the peft_config in the injected model.
@@ -160,6 +161,21 @@ def active_adapters(self) -> list[str]:
def forward(self, *args: Any, **kwargs: Any):
return self.model.forward(*args, **kwargs)

def _pre_injection_hook(self, model: nn.Module, config: PeftConfig, adapter_name: str) -> None:
r"""
A hook to be called before the adapter is injected into the model. This method can be overridden by child
classes to perform any pre-injection operations.

Args:
model (`nn.Module`):
The model to be adapted.
config (`PeftConfig`):
The adapter config.
adapter_name (`str`):
The adapter name.
"""
pass
Contributor:

[nit] why not raise NotImplementedError? Avoid silent failures if something incorrectly calls the hook.

Member (Author):

Passing is a valid outcome here; if we raised, all non-VeRA adapters would suddenly error ;)


@abstractmethod
def _prepare_adapter_config(self, peft_config: PeftConfig, model_config: dict) -> PeftConfig:
r"""
@@ -398,9 +414,9 @@ class BaseTunerLayer(ABC):
active_adapter = None

# All names of layers that may contain adapter (trainable) weights
adapter_layer_names: tuple[str] = ()
adapter_layer_names: tuple[str, ...] = ()
# All names of other parameters that may contain adapter-related parameters
other_param_names: tuple[str] = ()
other_param_names: tuple[str, ...] = ()

# indicates whether all adapters should be disabled
_disable_adapters: bool = False
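
For context on the hook added above, here is a hypothetical sketch (not code from this PR) of how a `BaseTuner` subclass might override `_pre_injection_hook` to set up state that all adapted layers share; the attribute name and the way the seed is read from the config are assumptions.

```python
# Hypothetical sketch, not part of this diff: a BaseTuner subclass could use
# _pre_injection_hook to prepare state shared by all adapted layers before
# inject_adapter() replaces any module. Attribute and config field names here
# are illustrative.
import torch
from torch import nn


class MySharedStateTuner:
    """Stand-in for a BaseTuner subclass; only the hook override is shown."""

    def _pre_injection_hook(self, model: nn.Module, config, adapter_name: str) -> None:
        # Called once per adapter from BaseTuner.__init__, right before injection.
        generator = torch.Generator().manual_seed(getattr(config, "projection_prng_key", 0))
        # e.g. a frozen random projection that every adapted layer will reuse
        self.shared_projection = torch.randn(4, 16, generator=generator)
```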
20 changes: 20 additions & 0 deletions src/peft/tuners/vera/__init__.py
@@ -0,0 +1,20 @@
# Copyright 2023-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .config import VeraConfig
from .layer import Linear, VeraLayer
from .model import VeraModel


__all__ = ["VeraConfig", "VeraLayer", "Linear", "VeraModel"]
160 changes: 160 additions & 0 deletions src/peft/tuners/vera/buffer_dict.py
@@ -0,0 +1,160 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# Adapted from https://botorch.org/api/_modules/botorch/utils/torch.html

# TODO: To be removed once (if) https://github.com/pytorch/pytorch/pull/37385 lands

from __future__ import annotations

import collections
from collections import OrderedDict

import torch
from torch.nn import Module


class BufferDict(Module):
r"""
Holds buffers in a dictionary.

BufferDict can be indexed like a regular Python dictionary, but buffers it contains are properly registered, and
will be visible by all Module methods. `torch.nn.BufferDict` is an **ordered** dictionary that respects

* the order of insertion, and
* in `torch.nn.BufferDict.update`, the order of the merged `OrderedDict` or another `torch.nn.BufferDict` (the
argument to `torch.nn.BufferDict.update`).

Note that `torch.nn.BufferDict.update` with other unordered mapping types (e.g., Python's plain `dict`) does not
preserve the order of the merged mapping.

Args:
buffers (iterable, optional):
a mapping (dictionary) of (string : `torch.Tensor`) or an iterable of key-value pairs of type (string,
`torch.Tensor`)

```python
class MyModule(nn.Module):
def __init__(self):
super().__init__()
self.buffers = nn.BufferDict({"left": torch.randn(5, 10), "right": torch.randn(5, 10)})

def forward(self, x, choice):
x = self.buffers[choice].mm(x)
return x
```
"""

def __init__(self, buffers=None, persistent: bool = False):
r"""
Args:
buffers (`dict`):
A mapping (dictionary) from string to `torch.Tensor`, or an iterable of key-value pairs of type
(string, `torch.Tensor`).
"""
super().__init__()
if buffers is not None:
self.update(buffers)

self.persistent = persistent

def __getitem__(self, key):
return self._buffers[key]

def __setitem__(self, key, buffer):
self.register_buffer(key, buffer, persistent=self.persistent)

def __delitem__(self, key):
del self._buffers[key]

def __len__(self):
return len(self._buffers)

def __iter__(self):
return iter(self._buffers.keys())

def __contains__(self, key):
return key in self._buffers

def clear(self):
"""Remove all items from the BufferDict."""
self._buffers.clear()

def pop(self, key):
r"""Remove key from the BufferDict and return its buffer.

Args:
key (`str`):
Key to pop from the BufferDict
"""
v = self[key]
del self[key]
return v

def keys(self):
r"""Return an iterable of the BufferDict keys."""
return self._buffers.keys()

def items(self):
r"""Return an iterable of the BufferDict key/value pairs."""
return self._buffers.items()

def values(self):
r"""Return an iterable of the BufferDict values."""
return self._buffers.values()

def update(self, buffers):
r"""
Update the `torch.nn.BufferDict` with the key-value pairs from a mapping or an iterable, overwriting existing
keys.

Note:
If `buffers` is an `OrderedDict`, a `torch.nn.BufferDict`, or an iterable of key-value pairs, the order of
new elements in it is preserved.

Args:
buffers (iterable):
a mapping (dictionary) from string to `torch.Tensor`, or an iterable of key-value pairs of type
(string, `torch.Tensor`).
"""
if not isinstance(buffers, collections.abc.Iterable):
raise TypeError(
"BuffersDict.update should be called with an "
"iterable of key/value pairs, but got " + type(buffers).__name__
)

if isinstance(buffers, collections.abc.Mapping):
if isinstance(buffers, (OrderedDict, BufferDict)):
for key, buffer in buffers.items():
self[key] = buffer
else:
for key, buffer in sorted(buffers.items()):
self[key] = buffer
else:
for j, p in enumerate(buffers):
if not isinstance(p, collections.abc.Iterable):
raise TypeError(
"BufferDict update sequence element "
"#" + str(j) + " should be Iterable; is" + type(p).__name__
)
if not len(p) == 2:
raise ValueError(
"BufferDict update sequence element "
"#" + str(j) + " has length " + str(len(p)) + "; 2 is required"
)
self[p[0]] = p[1]

def extra_repr(self):
child_lines = []
for k, p in self._buffers.items():
size_str = "x".join(str(size) for size in p.size())
device_str = "" if not p.is_cuda else f" (GPU {p.get_device()})"
parastr = f"Buffer containing: [{torch.typename(p)} of size {size_str}{device_str}]"
child_lines.append(" (" + k + "): " + parastr)
tmpstr = "\n".join(child_lines)
return tmpstr

def __call__(self, input):
raise RuntimeError("BufferDict should not be called.")
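
To round off this file, a short usage sketch for `BufferDict`, assuming the package is installed with this PR's changes; the wrapping module, buffer names, and tensor shapes are illustrative.

```python
# Illustrative usage of the BufferDict defined above: buffers registered
# through it move with the module and, with persistent=True, appear in
# state_dict().
import torch
from torch import nn

from peft.tuners.vera.buffer_dict import BufferDict


class SharedProjections(nn.Module):
    def __init__(self):
        super().__init__()
        # persistent=True registers the buffers so they are saved in state_dict()
        self.projections = BufferDict(
            {"vera_A": torch.randn(4, 16), "vera_B": torch.randn(16, 4)},
            persistent=True,
        )

    def forward(self, x):
        # Buffers follow the module on .to(device) like any registered buffer
        return x @ self.projections["vera_A"].T


module = SharedProjections()
print(sorted(module.state_dict().keys()))  # ['projections.vera_A', 'projections.vera_B']
print(module(torch.randn(2, 16)).shape)    # torch.Size([2, 4])
```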