
[Core] enable out-of-tree model register #3871

Merged 14 commits on Apr 7, 2024
15 changes: 15 additions & 0 deletions docs/source/models/adding_model.rst
@@ -21,6 +21,8 @@ This document provides a high-level guide on integrating a `HuggingFace Transfor
Start by forking our `GitHub`_ repository and then :ref:`build it from source <build_from_source>`.
This gives you the ability to modify the codebase and test your model.

.. tip::
If you don't want to fork the repository and modify vLLM's codebase, please refer to the "Out-of-Tree Model Integration" section below.

1. Bring your model code
------------------------
@@ -94,3 +96,16 @@ This method should load the weights from the HuggingFace's checkpoint file and a
----------------------

Finally, include your :code:`*ForCausalLM` class in `vllm/model_executor/models/__init__.py <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/__init__.py>`_ and register it to the :code:`_MODEL_REGISTRY` in `vllm/model_executor/model_loader.py <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/model_loader.py>`_.

6. Out-of-Tree Model Integration
--------------------------------------------

We also provide a way to integrate a model without modifying the vLLM codebase. Steps 2, 3, and 4 are still required, but you can skip steps 1 and 5.

Just add the following lines in your code:

.. code-block:: python

from vllm.model_executor.models import ModelRegistry
from your_code import YourModelForCausalLM
ModelRegistry.register_out_of_tree_model("YourModelForCausalLM", YourModelForCausalLM)
34 changes: 34 additions & 0 deletions tests/models/test_oot_registration.py
@@ -0,0 +1,34 @@
import torch

from vllm import LLM, SamplingParams
from vllm.model_executor.models import ModelRegistry
from vllm.model_executor.models.opt import OPTForCausalLM
from vllm.model_executor.sampling_metadata import SamplingMetadata


class MyOPTForCausalLM(OPTForCausalLM):

def compute_logits(self, hidden_states: torch.Tensor,
sampling_metadata: SamplingMetadata) -> torch.Tensor:
# this dummy model always predicts the first token
logits = super().compute_logits(hidden_states, sampling_metadata)
logits.zero_()
logits[:, 0] += 1.0
return logits


def test_oot_registration():
# register our dummy model
ModelRegistry.register_out_of_tree_model("OPTForCausalLM",
MyOPTForCausalLM)
prompts = ["Hello, my name is", "The text does not matter"]
sampling_params = SamplingParams(temperature=0)
llm = LLM(model="facebook/opt-125m")
first_token = llm.get_tokenizer().decode(0)
outputs = llm.generate(prompts, sampling_params)

for output in outputs:
generated_text = output.outputs[0].text
# make sure only the first token is generated
rest = generated_text.replace(first_token, "")
assert rest == ""
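
The test's trick can be illustrated in isolation (a plain-Python sketch with a hypothetical vocabulary size, not the real OPT logits): zeroing all logits and bumping index 0 makes the argmax 0 on every step, so greedy decoding (`temperature=0`) emits only token 0.

```python
# Sketch of the dummy model's compute_logits, on a plain list of floats.

def dummy_logits(vocab_size: int) -> list:
    logits = [0.0] * vocab_size  # mirrors logits.zero_()
    logits[0] += 1.0             # mirrors logits[:, 0] += 1.0
    return logits

logits = dummy_logits(50272)  # hypothetical vocab size for illustration
# Greedy sampling picks the argmax, which is always index 0 here.
next_token = max(range(len(logits)), key=logits.__getitem__)
print(next_token)  # 0
```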
14 changes: 13 additions & 1 deletion vllm/model_executor/models/__init__.py
@@ -1,5 +1,5 @@
import importlib
from typing import List, Optional, Type
from typing import Dict, List, Optional, Type

import torch.nn as nn

@@ -55,6 +55,10 @@
"XverseForCausalLM": ("xverse", "XverseForCausalLM"),
}

# Out-of-tree models: architecture name -> model class.
_OOT_MODELS: Dict[str, Type[nn.Module]] = {}

# Models not supported by ROCm.
_ROCM_UNSUPPORTED_MODELS = []

@@ -74,6 +78,8 @@ class ModelRegistry:

@staticmethod
def load_model_cls(model_arch: str) -> Optional[Type[nn.Module]]:
if model_arch in _OOT_MODELS:
return _OOT_MODELS[model_arch]
if model_arch not in _MODELS:
return None
if is_hip():
@@ -95,6 +101,12 @@ def load_model_cls(model_arch: str) -> Optional[Type[nn.Module]]:
def get_supported_archs() -> List[str]:
return list(_MODELS.keys())

@staticmethod
def register_out_of_tree_model(model_arch: str,
model_cls: Type[nn.Module]):
global _OOT_MODELS
_OOT_MODELS[model_arch] = model_cls


__all__ = [
"ModelRegistry",