Remove --sdp-attention, --xformers flags (oobabooga#5126)
oobabooga authored and PoetOnTheRun committed Feb 22, 2024
1 parent c0c8c02 commit a778ff1
Showing 4 changed files with 1 addition and 180 deletions.
2 changes: 0 additions & 2 deletions README.md
@@ -231,8 +231,6 @@ List of command-line flags
| `--load-in-8bit` | Load the model with 8-bit precision (using bitsandbytes). |
| `--bf16` | Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. |
| `--no-cache` | Set `use_cache` to `False` while generating text. This reduces VRAM usage slightly, but it comes at a performance cost. |
-| `--xformers` | Use xformer's memory efficient attention. This is really old and probably doesn't do anything. |
-| `--sdp-attention` | Use PyTorch 2.0's SDP attention. Same as above. |
| `--trust-remote-code` | Set `trust_remote_code=True` while loading the model. Necessary for some models. |
| `--no_use_fast` | Set use_fast=False while loading the tokenizer (it's True by default). Use this if you have any problems related to use_fast. |
| `--use_flash_attention_2` | Set use_flash_attention_2=True while loading the model. |
171 changes: 0 additions & 171 deletions modules/llama_attn_hijack.py

This file was deleted.
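The deleted module implemented the attention hijack selected by the removed flags. Its 171 lines are not shown here; as context only, a minimal hedged sketch of the memory-efficient attention primitive those flags targeted (PyTorch's native scaled_dot_product_attention, not the contents of the deleted file):

```python
import torch
import torch.nn.functional as F

# Toy shapes: (batch, heads, seq_len, head_dim). Illustrative only; this is
# the PyTorch 2.x SDP attention primitive the removed --sdp-attention flag
# routed to, not code from llama_attn_hijack.py.
q = torch.randn(1, 8, 128, 64)
k = torch.randn(1, 8, 128, 64)
v = torch.randn(1, 8, 128, 64)

# Causal (autoregressive) attention; PyTorch selects an efficient kernel
# (flash / memory-efficient / math) automatically.
out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
print(out.shape)  # torch.Size([1, 8, 128, 64])
```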

6 changes: 1 addition & 5 deletions modules/models.py
@@ -21,7 +21,7 @@
)

import modules.shared as shared
-from modules import RoPE, llama_attn_hijack, sampler_hijack
+from modules import RoPE, sampler_hijack
from modules.logging_colors import logger
from modules.models_settings import get_model_metadata
from modules.relative_imports import RelativeImport
@@ -97,10 +97,6 @@ def load_model(model_name, loader=None):
else:
tokenizer = load_tokenizer(model_name, model)

-    # Hijack attention with xformers
-    if any((shared.args.xformers, shared.args.sdp_attention)):
-        llama_attn_hijack.hijack_llama_attention()

shared.settings.update({k: v for k, v in metadata.items() if k in shared.settings})
if loader.lower().startswith('exllama'):
shared.settings['truncation_length'] = shared.args.max_seq_len
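For context on why the call site above could be dropped: recent transformers releases expose SDP attention natively at load time, so no monkey-patching is needed. A hedged sketch of that native path (not code from this commit; the model name is a placeholder):

```python
from transformers import AutoModelForCausalLM

# attn_implementation="sdpa" selects PyTorch's scaled_dot_product_attention
# path directly in recent transformers versions, which is what makes a
# separate attention hijack unnecessary. The model name is illustrative.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    attn_implementation="sdpa",
)
```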
2 changes: 0 additions & 2 deletions modules/shared.py
@@ -98,8 +98,6 @@
group.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision (using bitsandbytes).')
group.add_argument('--bf16', action='store_true', help='Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.')
group.add_argument('--no-cache', action='store_true', help='Set use_cache to False while generating text. This reduces VRAM usage slightly, but it comes at a performance cost.')
-group.add_argument('--xformers', action='store_true', help='Use xformer\'s memory efficient attention. This is really old and probably doesn\'t do anything.')
-group.add_argument('--sdp-attention', action='store_true', help='Use PyTorch 2.0\'s SDP attention. Same as above.')
group.add_argument('--trust-remote-code', action='store_true', help='Set trust_remote_code=True while loading the model. Necessary for some models.')
group.add_argument('--force-safetensors', action='store_true', help='Set use_safetensors=True while loading the model. This prevents arbitrary code execution.')
group.add_argument('--no_use_fast', action='store_true', help='Set use_fast=False while loading the tokenizer (it\'s True by default). Use this if you have any problems related to use_fast.')
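The flags above follow a plain argparse pattern; a minimal, self-contained reproduction of it (the group name and flag selection here are illustrative, not the webui's full parser):

```python
import argparse

parser = argparse.ArgumentParser()
# An argument group only affects --help grouping; store_true flags default to
# False and flip to True when passed.
group = parser.add_argument_group('Transformers/Accelerate')
group.add_argument('--bf16', action='store_true',
                   help='Load the model with bfloat16 precision.')
group.add_argument('--load-in-8bit', action='store_true',
                   help='Load the model with 8-bit precision (using bitsandbytes).')

args = parser.parse_args(['--bf16'])
print(args.bf16, args.load_in_8bit)  # True False (dashes become underscores)
```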
