Remove --sdp-attention, --xformers flags (oobabooga#5126)
oobabooga authored and PoetOnTheRun committed Feb 22, 2024
1 parent c0c8c02 commit a778ff1
Showing 4 changed files with 1 addition and 180 deletions.
2 changes: 0 additions & 2 deletions README.md
@@ -231,8 +231,6 @@ List of command-line flags
| `--load-in-8bit` | Load the model with 8-bit precision (using bitsandbytes). |
| `--bf16` | Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. |
| `--no-cache` | Set `use_cache` to `False` while generating text. This reduces VRAM usage slightly, but it comes at a performance cost. |
-| `--xformers` | Use xformer's memory efficient attention. This is really old and probably doesn't do anything. |
-| `--sdp-attention` | Use PyTorch 2.0's SDP attention. Same as above. |
| `--trust-remote-code` | Set `trust_remote_code=True` while loading the model. Necessary for some models. |
| `--no_use_fast` | Set use_fast=False while loading the tokenizer (it's True by default). Use this if you have any problems related to use_fast. |
| `--use_flash_attention_2` | Set use_flash_attention_2=True while loading the model. |
171 changes: 0 additions & 171 deletions modules/llama_attn_hijack.py

This file was deleted.
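The deleted module implemented the attention hijack selected by the removed flags. Its 171 lines are not shown here; as context only, a minimal hedged sketch of the memory-efficient attention primitive those flags targeted (PyTorch's native scaled_dot_product_attention, not the contents of the deleted file):

```python
import torch
import torch.nn.functional as F

# Toy shapes: (batch, heads, seq_len, head_dim). Illustrative only; this is
# the PyTorch 2.x SDP attention primitive the removed --sdp-attention flag
# routed to, not code from llama_attn_hijack.py.
q = torch.randn(1, 8, 128, 64)
k = torch.randn(1, 8, 128, 64)
v = torch.randn(1, 8, 128, 64)

# Causal (autoregressive) attention; PyTorch selects an efficient kernel
# (flash / memory-efficient / math) automatically.
out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
print(out.shape)  # torch.Size([1, 8, 128, 64])
```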

6 changes: 1 addition & 5 deletions modules/models.py
@@ -21,7 +21,7 @@
)

import modules.shared as shared
-from modules import RoPE, llama_attn_hijack, sampler_hijack
+from modules import RoPE, sampler_hijack
from modules.logging_colors import logger
from modules.models_settings import get_model_metadata
from modules.relative_imports import RelativeImport
@@ -97,10 +97,6 @@ def load_model(model_name, loader=None):
else:
tokenizer = load_tokenizer(model_name, model)

-    # Hijack attention with xformers
-    if any((shared.args.xformers, shared.args.sdp_attention)):
-        llama_attn_hijack.hijack_llama_attention()

shared.settings.update({k: v for k, v in metadata.items() if k in shared.settings})
if loader.lower().startswith('exllama'):
shared.settings['truncation_length'] = shared.args.max_seq_len
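For context on why the call site above could be dropped: recent transformers releases expose SDP attention natively at load time, so no monkey-patching is needed. A hedged sketch of that native path (not code from this commit; the model name is a placeholder):

```python
from transformers import AutoModelForCausalLM

# attn_implementation="sdpa" selects PyTorch's scaled_dot_product_attention
# path directly in recent transformers versions, which is what makes a
# separate attention hijack unnecessary. The model name is illustrative.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    attn_implementation="sdpa",
)
```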
2 changes: 0 additions & 2 deletions modules/shared.py
@@ -98,8 +98,6 @@
group.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision (using bitsandbytes).')
group.add_argument('--bf16', action='store_true', help='Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.')
group.add_argument('--no-cache', action='store_true', help='Set use_cache to False while generating text. This reduces VRAM usage slightly, but it comes at a performance cost.')
-group.add_argument('--xformers', action='store_true', help='Use xformer\'s memory efficient attention. This is really old and probably doesn\'t do anything.')
-group.add_argument('--sdp-attention', action='store_true', help='Use PyTorch 2.0\'s SDP attention. Same as above.')
group.add_argument('--trust-remote-code', action='store_true', help='Set trust_remote_code=True while loading the model. Necessary for some models.')
group.add_argument('--force-safetensors', action='store_true', help='Set use_safetensors=True while loading the model. This prevents arbitrary code execution.')
group.add_argument('--no_use_fast', action='store_true', help='Set use_fast=False while loading the tokenizer (it\'s True by default). Use this if you have any problems related to use_fast.')
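The flags above follow a plain argparse pattern; a minimal, self-contained reproduction of it (the group name and flag selection here are illustrative, not the webui's full parser):

```python
import argparse

parser = argparse.ArgumentParser()
# An argument group only affects --help grouping; store_true flags default to
# False and flip to True when passed.
group = parser.add_argument_group('Transformers/Accelerate')
group.add_argument('--bf16', action='store_true',
                   help='Load the model with bfloat16 precision.')
group.add_argument('--load-in-8bit', action='store_true',
                   help='Load the model with 8-bit precision (using bitsandbytes).')

args = parser.parse_args(['--bf16'])
print(args.bf16, args.load_in_8bit)  # True False (dashes become underscores)
```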
