
Commit

Remove non-HF ExLlamaV2 loader (#5431)
oobabooga authored Feb 4, 2024
1 parent b6077b0 commit cde000d
Showing 9 changed files with 18 additions and 220 deletions.
8 changes: 2 additions & 6 deletions modules/LoRA.py
@@ -12,7 +12,7 @@
def add_lora_to_model(lora_names):
if 'GPTQForCausalLM' in shared.model.__class__.__name__ or shared.args.loader == 'AutoGPTQ':
add_lora_autogptq(lora_names)
elif shared.model.__class__.__name__ in ['Exllamav2Model', 'Exllamav2HF'] or shared.args.loader == ['ExLlamav2', 'ExLlamav2_HF']:
elif shared.model.__class__.__name__ in ['Exllamav2HF'] or shared.args.loader == ['ExLlamav2_HF']:
add_lora_exllamav2(lora_names)
else:
add_lora_transformers(lora_names)
@@ -39,11 +39,7 @@ def add_lora_exllamav2(lora_names):
shared.model.loras = []
for lora_name in lora_names:
lora_path = get_lora_path(lora_name)
if shared.model.__class__.__name__ == 'Exllamav2Model':
lora = ExLlamaV2Lora.from_directory(shared.model.model, str(lora_path))
else:
lora = ExLlamaV2Lora.from_directory(shared.model.ex_model, str(lora_path))

lora = ExLlamaV2Lora.from_directory(shared.model.ex_model, str(lora_path))
shared.model.loras.append(lora)

shared.lora_names = lora_names
149 changes: 0 additions & 149 deletions modules/exllamav2.py

This file was deleted.

33 changes: 0 additions & 33 deletions modules/loaders.py
@@ -81,16 +81,6 @@
'trust_remote_code',
'no_use_fast',
],
'ExLlamav2': [
'gpu_split',
'max_seq_len',
'no_flash_attn',
'num_experts_per_token',
'cache_8bit',
'alpha_value',
'compress_pos_emb',
'exllamav2_info',
],
'AutoGPTQ': [
'triton',
'no_inject_fused_attention',
@@ -204,29 +194,6 @@ def transformers_samplers():
'AutoAWQ': transformers_samplers(),
'QuIP#': transformers_samplers(),
'HQQ': transformers_samplers(),
'ExLlamav2': {
'temperature',
'temperature_last',
'top_p',
'min_p',
'top_k',
'typical_p',
'tfs',
'top_a',
'repetition_penalty',
'presence_penalty',
'frequency_penalty',
'repetition_penalty_range',
'seed',
'mirostat_mode',
'mirostat_tau',
'mirostat_eta',
'ban_eos_token',
'add_bos_token',
'custom_token_bans',
'skip_special_tokens',
'auto_max_new_tokens',
},
'ExLlamav2_HF': {
'temperature',
'temperature_last',
12 changes: 3 additions & 9 deletions modules/logits.py
@@ -13,11 +13,10 @@ def get_next_logits(prompt, state, use_samplers, previous, top_logits=25, return
logger.error("No model is loaded! Select one in the Model tab.")
return 'Error: No model is loaded! Select one in the Model tab.', previous

is_non_hf_exllamav2 = shared.model.__class__.__name__ == 'Exllamav2Model'
is_non_hf_llamacpp = shared.model.__class__.__name__ == 'LlamaCppModel'

if use_samplers:
if any([is_non_hf_exllamav2, is_non_hf_llamacpp]):
if is_non_hf_llamacpp:
logger.error("Sampler hijacking is not supported non-Huggingface loaders.")
# sampling is all done in c for exllama, so it is really hard to hijack
# it should be possible to hijack llamacpp sampler by hijacking all their sampling methods,
@@ -31,20 +30,15 @@

scores = sampler_hijack.global_scores[-1]
else:
if is_non_hf_exllamav2:
if is_torch_xpu_available():
tokens = shared.tokenizer.encode(prompt).to("xpu:0")
else:
tokens = shared.tokenizer.encode(prompt).cuda()
scores = shared.model.get_logits(tokens)[-1][-1]
elif is_non_hf_llamacpp:
if is_non_hf_llamacpp:
tokens = shared.tokenizer.encode(prompt)
scores = shared.model.get_logits(tokens)[-1][-1]
else:
if is_torch_xpu_available():
tokens = shared.tokenizer.encode(prompt, return_tensors='pt').to("xpu:0")
else:
tokens = shared.tokenizer.encode(prompt, return_tensors='pt').cuda()

output = shared.model(input_ids=tokens)
scores = output['logits'][-1][-1]

8 changes: 0 additions & 8 deletions modules/models.py
@@ -65,7 +65,6 @@ def load_model(model_name, loader=None):
'GPTQ-for-LLaMa': GPTQ_loader,
'llama.cpp': llamacpp_loader,
'llamacpp_HF': llamacpp_HF_loader,
'ExLlamav2': ExLlamav2_loader,
'ExLlamav2_HF': ExLlamav2_HF_loader,
'ctransformers': ctransformers_loader,
'AutoAWQ': AutoAWQ_loader,
@@ -376,13 +375,6 @@ def AutoGPTQ_loader(model_name):
return modules.AutoGPTQ_loader.load_quantized(model_name)


def ExLlamav2_loader(model_name):
from modules.exllamav2 import Exllamav2Model

model, tokenizer = Exllamav2Model.from_pretrained(model_name)
return model, tokenizer


def ExLlamav2_HF_loader(model_name):
from modules.exllamav2_hf import Exllamav2HF

2 changes: 2 additions & 0 deletions modules/models_settings.py
@@ -141,6 +141,8 @@ def get_model_metadata(model):
if re.match(pat.lower(), model.lower()):
for k in settings[pat]:
model_settings[k] = settings[pat][k]
if k == 'loader' and settings[pat][k] == 'ExLlamav2':
model_settings[k] = 'ExLlamav2_HF'

return model_settings

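The two added lines above migrate old per-model settings: any entry that still names the removed ExLlamav2 loader is rewritten to ExLlamav2_HF when the metadata is applied. A minimal, self-contained sketch of that remapping, assuming the regex-keyed settings structure visible in the diff (the settings dict, pattern, function name, and model name below are hypothetical):

import re

# Hypothetical per-model settings, keyed by a regex on the model name.
settings = {'.*exl2': {'loader': 'ExLlamav2', 'max_seq_len': 4096}}

def resolve_model_settings(model, settings):
    model_settings = {}
    for pat in settings:
        if re.match(pat.lower(), model.lower()):
            for k in settings[pat]:
                model_settings[k] = settings[pat][k]
                # Old configs may still reference the removed loader; fall back to the HF wrapper.
                if k == 'loader' and settings[pat][k] == 'ExLlamav2':
                    model_settings[k] = 'ExLlamav2_HF'
    return model_settings

print(resolve_model_settings('MyModel-4.0bpw-exl2', settings))
# {'loader': 'ExLlamav2_HF', 'max_seq_len': 4096}
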
14 changes: 5 additions & 9 deletions modules/shared.py
@@ -88,7 +88,7 @@

# Model loader
group = parser.add_argument_group('Model loader')
group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ, AutoAWQ, GPTQ-for-LLaMa, ctransformers, QuIP#.')
group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, AutoGPTQ, AutoAWQ, GPTQ-for-LLaMa, ctransformers, QuIP#.')

# Transformers/Accelerate
group = parser.add_argument_group('Transformers/Accelerate')
@@ -130,11 +130,11 @@
group.add_argument('--no_offload_kqv', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
group.add_argument('--cache-capacity', type=str, help='Maximum cache capacity (llama-cpp-python). Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed.')

# ExLlama
group = parser.add_argument_group('ExLlama')
# ExLlamaV2
group = parser.add_argument_group('ExLlamaV2')
group.add_argument('--gpu-split', type=str, help='Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7.')
group.add_argument('--max_seq_len', type=int, default=2048, help='Maximum sequence length.')
group.add_argument('--cfg-cache', action='store_true', help='ExLlamav2_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.')
group.add_argument('--cfg-cache', action='store_true', help='Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.')
group.add_argument('--no_flash_attn', action='store_true', help='Force flash-attention to not be used.')
group.add_argument('--cache_8bit', action='store_true', help='Use 8-bit cache to save VRAM.')
group.add_argument('--num_experts_per_token', type=int, default=2, help='Number of experts to use for generation. Applies to MoE models like Mixtral.')
@@ -248,11 +248,7 @@ def fix_loader_name(name):
return 'AutoGPTQ'
elif name in ['gptq-for-llama', 'gptqforllama', 'gptqllama', 'gptq for llama', 'gptq_for_llama']:
return 'GPTQ-for-LLaMa'
elif name in ['exllama', 'ex-llama', 'ex_llama', 'exlama']:
return 'ExLlama'
elif name in ['exllamav2', 'exllama-v2', 'ex_llama-v2', 'exlamav2', 'exlama-v2', 'exllama2', 'exllama-2']:
return 'ExLlamav2'
elif name in ['exllamav2-hf', 'exllamav2_hf', 'exllama-v2-hf', 'exllama_v2_hf', 'exllama-v2_hf', 'exllama2-hf', 'exllama2_hf', 'exllama-2-hf', 'exllama_2_hf', 'exllama-2_hf']:
elif name in ['exllamav2', 'exllama-v2', 'ex_llama-v2', 'exlamav2', 'exlama-v2', 'exllama2', 'exllama-2', 'exllama', 'ex-llama', 'ex_llama', 'exlama', 'exllamav2-hf', 'exllamav2_hf', 'exllama-v2-hf', 'exllama_v2_hf', 'exllama-v2_hf', 'exllama2-hf', 'exllama2_hf', 'exllama-2-hf', 'exllama_2_hf', 'exllama-2_hf']:
return 'ExLlamav2_HF'
elif name in ['ctransformers', 'ctranforemrs', 'ctransformer']:
return 'ctransformers'
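
With the standalone backend removed, every ExLlama-family alias accepted by fix_loader_name now resolves to the HF wrapper, so --loader exllama or --loader exllamav2 selects ExLlamav2_HF. A trimmed, self-contained sketch of that consolidated branch, reproducing only the aliases shown in the diff (the helper name and the case-insensitive comparison are assumptions made for the sketch):

# Aliases copied from the consolidated branch in the diff; all of them map to the HF loader.
EXLLAMA_ALIASES = {
    'exllama', 'ex-llama', 'ex_llama', 'exlama',
    'exllamav2', 'exllama-v2', 'ex_llama-v2', 'exlamav2', 'exlama-v2', 'exllama2', 'exllama-2',
    'exllamav2-hf', 'exllamav2_hf', 'exllama-v2-hf', 'exllama_v2_hf', 'exllama-v2_hf',
    'exllama2-hf', 'exllama2_hf', 'exllama-2-hf', 'exllama_2_hf', 'exllama-2_hf',
}

def fix_exllama_loader_name(name: str) -> str:
    # Case-insensitive comparison is assumed here for the sketch.
    return 'ExLlamav2_HF' if name.lower() in EXLLAMA_ALIASES else name

assert fix_exllama_loader_name('exllama') == 'ExLlamav2_HF'
assert fix_exllama_loader_name('ExLlamaV2') == 'ExLlamav2_HF'
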
11 changes: 6 additions & 5 deletions modules/text_generation.py
@@ -45,7 +45,7 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
yield ''
return

if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'CtransformersModel']:
if shared.model.__class__.__name__ in ['LlamaCppModel', 'CtransformersModel']:
generate_func = generate_reply_custom
else:
generate_func = generate_reply_HF
@@ -120,10 +120,11 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
if shared.tokenizer is None:
raise ValueError('No tokenizer is loaded')

if shared.model.__class__.__name__ in ['LlamaCppModel', 'CtransformersModel', 'Exllamav2Model']:
if shared.model.__class__.__name__ in ['LlamaCppModel', 'CtransformersModel']:
input_ids = shared.tokenizer.encode(str(prompt))
if shared.model.__class__.__name__ not in ['Exllamav2Model']:
input_ids = np.array(input_ids).reshape(1, len(input_ids))
# The step below is necessary for llama.cpp, but may not be
# necessary for future loaders.
input_ids = np.array(input_ids).reshape(1, len(input_ids))
else:
input_ids = shared.tokenizer.encode(str(prompt), return_tensors='pt', add_special_tokens=add_special_tokens)
if not add_bos_token:
@@ -134,7 +135,7 @@
if truncation_length is not None:
input_ids = input_ids[:, -truncation_length:]

if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'CtransformersModel'] or shared.args.cpu:
if shared.model.__class__.__name__ in ['LlamaCppModel', 'CtransformersModel'] or shared.args.cpu:
return input_ids
elif shared.args.deepspeed:
return input_ids.to(device=local_rank)
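
The comment kept in encode() is about shape rather than content: llama.cpp's tokenizer returns a flat list of token ids, and the reshape gives it the batch dimension the rest of the pipeline expects from HF-style tensors. A tiny standalone illustration of that step (the token ids below are made up):

import numpy as np

input_ids = [1, 15043, 3186]  # hypothetical token ids from a llama.cpp tokenizer
batched = np.array(input_ids).reshape(1, len(input_ids))
print(batched.shape)  # (1, 3): one row per prompt, matching the HF-style [batch, seq_len] layout
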
1 change: 0 additions & 1 deletion modules/ui_model_menu.py
@@ -135,7 +135,6 @@ def create_ui():
shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.')
shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.')
shared.gradio['gptq_for_llama_info'] = gr.Markdown('Legacy loader for compatibility with older GPUs. ExLlamav2_HF or AutoGPTQ are preferred for GPTQ models when supported.')
shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.")
shared.gradio['llamacpp_HF_info'] = gr.Markdown('llamacpp_HF loads llama.cpp as a Transformers model. To use it, you need to download a tokenizer.\n\nOption 1 (recommended): place your .gguf in a subfolder of models/ along with these 4 files: special_tokens_map.json, tokenizer_config.json, tokenizer.json, tokenizer.model.\n\nOption 2: download `oobabooga/llama-tokenizer` under "Download model or LoRA". That\'s a default Llama tokenizer that will work for some (but not all) models.')

with gr.Column():
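
The llamacpp_HF note above hinges on four tokenizer files sitting next to the .gguf. A small, hedged helper for checking a model folder before loading; it is not part of the webui code, and the folder name is hypothetical:

from pathlib import Path

model_dir = Path('models/my-gguf-model')  # hypothetical subfolder under models/

required = ['special_tokens_map.json', 'tokenizer_config.json', 'tokenizer.json', 'tokenizer.model']
missing = [name for name in required if not (model_dir / name).exists()]
if missing:
    print('llamacpp_HF still needs these tokenizer files next to the .gguf:', ', '.join(missing))
else:
    print('All tokenizer files found; llamacpp_HF should be able to use this folder.')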

5 comments on commit cde000d

@aikitoria

Why remove it? This loader provided better performance than the HF one; on larger models it was sometimes more than 10% faster.

@M4st3rfun

Why did you delete that? Even ExLlamav2_HF doesn't work anymore...

@aikitoria

This was already reverted; see the linked issue. What do you mean by it not working anymore? Both loaders appear to work fine for me on the current state of the dev branch.

@M4st3rfun

At first I updated normally by clicking the update button, then I realized that ExLlamav2_HF doesn't work anymore. After you mentioned the dev branch, I tested it and it doesn't work either. I also did a clean install and it still doesn't work; it says this:

Traceback (most recent call last):
  File "C:\AI\oo\text-generation-webui\modules\ui_model_menu.py", line 220, in load_model_wrapper
    shared.model, shared.tokenizer = load_model(selected_model, loader)
  File "C:\AI\oo\text-generation-webui\modules\models.py", line 87, in load_model
    output = load_func_map[loader](model_name)
  File "C:\AI\oo\text-generation-webui\modules\models.py", line 387, in ExLlamav2_HF_loader
    from modules.exllamav2_hf import Exllamav2HF
  File "C:\AI\oo\text-generation-webui\modules\exllamav2_hf.py", line 7, in <module>
    from exllamav2 import (
  File "C:\AI\oo\text-generation-webui\installer_files\env\Lib\site-packages\exllamav2\__init__.py", line 3, in <module>
    from exllamav2.model import ExLlamaV2
  File "C:\AI\oo\text-generation-webui\installer_files\env\Lib\site-packages\exllamav2\model.py", line 16, in <module>
    from exllamav2.config import ExLlamaV2Config
  File "C:\AI\oo\text-generation-webui\installer_files\env\Lib\site-packages\exllamav2\config.py", line 2, in <module>
    from exllamav2.fasttensors import STFile
  File "C:\AI\oo\text-generation-webui\installer_files\env\Lib\site-packages\exllamav2\fasttensors.py", line 5, in <module>
    from exllamav2.ext import exllamav2_ext as ext_c
  File "C:\AI\oo\text-generation-webui\installer_files\env\Lib\site-packages\exllamav2\ext.py", line 142, in <module>
    exllamav2_ext = load \
  File "C:\AI\oo\text-generation-webui\installer_files\env\Lib\site-packages\torch\utils\cpp_extension.py", line 1308, in load
    return _jit_compile(
  File "C:\AI\oo\text-generation-webui\installer_files\env\Lib\site-packages\torch\utils\cpp_extension.py", line 1736, in _jit_compile
    return _import_module_from_library(name, build_directory, is_python_module)
  File "C:\AI\oo\text-generation-webui\installer_files\env\Lib\site-packages\torch\utils\cpp_extension.py", line 2136, in _import_module_from_library
    module = importlib.util.module_from_spec(spec)
ImportError: DLL load failed while importing exllamav2_ext: The specified module was not found.

@aikitoria

Hmm, I only run it on linux servers, so I've never seen that error before. Probably best to move that to a separate issue on github to get more eyes on it.
