Remove non-HF ExLlamaV2 loader (oobabooga#5431)
oobabooga authored and PoetOnTheRun committed Feb 22, 2024
1 parent 0910f60 commit 4dc6434
Showing 9 changed files with 18 additions and 220 deletions.
8 changes: 2 additions & 6 deletions modules/LoRA.py
@@ -12,7 +12,7 @@
def add_lora_to_model(lora_names):
if 'GPTQForCausalLM' in shared.model.__class__.__name__ or shared.args.loader == 'AutoGPTQ':
add_lora_autogptq(lora_names)
elif shared.model.__class__.__name__ in ['Exllamav2Model', 'Exllamav2HF'] or shared.args.loader == ['ExLlamav2', 'ExLlamav2_HF']:
elif shared.model.__class__.__name__ in ['Exllamav2HF'] or shared.args.loader == 'ExLlamav2_HF':
add_lora_exllamav2(lora_names)
else:
add_lora_transformers(lora_names)
@@ -39,11 +39,7 @@ def add_lora_exllamav2(lora_names):
shared.model.loras = []
for lora_name in lora_names:
lora_path = get_lora_path(lora_name)
if shared.model.__class__.__name__ == 'Exllamav2Model':
lora = ExLlamaV2Lora.from_directory(shared.model.model, str(lora_path))
else:
lora = ExLlamaV2Lora.from_directory(shared.model.ex_model, str(lora_path))

lora = ExLlamaV2Lora.from_directory(shared.model.ex_model, str(lora_path))
shared.model.loras.append(lora)

shared.lora_names = lora_names
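A note for readers following the LoRA path: with the standalone `Exllamav2Model` class gone, `add_lora_exllamav2` no longer branches on the model class and always loads through the HF wrapper's `ex_model` attribute. Below is a minimal stub-based sketch of that simplified flow; the `Fake*` classes and the `loras/` path are illustrative stand-ins, not the real `exllamav2` or webui APIs.

```python
# Illustrative sketch only: FakeLora stands in for exllamav2.ExLlamaV2Lora and
# FakeExllamav2HF for the webui's Exllamav2HF wrapper.
class FakeLora:
    @staticmethod
    def from_directory(ex_model, path):
        return f"LoRA from {path} attached to {ex_model}"

class FakeExllamav2HF:
    def __init__(self):
        self.ex_model = "underlying ExLlamaV2 model"  # the HF wrapper exposes the raw model here
        self.loras = []

def add_lora_exllamav2_sketch(model, lora_names):
    """Single code path after this commit: always load through model.ex_model."""
    model.loras = []
    for name in lora_names:
        lora_path = f"loras/{name}"  # stand-in for get_lora_path()
        model.loras.append(FakeLora.from_directory(model.ex_model, str(lora_path)))
    return model.loras

print(add_lora_exllamav2_sketch(FakeExllamav2HF(), ["my-lora"]))
```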
149 changes: 0 additions & 149 deletions modules/exllamav2.py

This file was deleted.

33 changes: 0 additions & 33 deletions modules/loaders.py
@@ -81,16 +81,6 @@
'trust_remote_code',
'no_use_fast',
],
'ExLlamav2': [
'gpu_split',
'max_seq_len',
'no_flash_attn',
'num_experts_per_token',
'cache_8bit',
'alpha_value',
'compress_pos_emb',
'exllamav2_info',
],
'AutoGPTQ': [
'triton',
'no_inject_fused_attention',
@@ -204,29 +194,6 @@ def transformers_samplers():
'AutoAWQ': transformers_samplers(),
'QuIP#': transformers_samplers(),
'HQQ': transformers_samplers(),
'ExLlamav2': {
'temperature',
'temperature_last',
'top_p',
'min_p',
'top_k',
'typical_p',
'tfs',
'top_a',
'repetition_penalty',
'presence_penalty',
'frequency_penalty',
'repetition_penalty_range',
'seed',
'mirostat_mode',
'mirostat_tau',
'mirostat_eta',
'ban_eos_token',
'add_bos_token',
'custom_token_bans',
'skip_special_tokens',
'auto_max_new_tokens',
},
'ExLlamav2_HF': {
'temperature',
'temperature_last',
12 changes: 3 additions & 9 deletions modules/logits.py
@@ -13,11 +13,10 @@ def get_next_logits(prompt, state, use_samplers, previous, top_logits=25, return
logger.error("No model is loaded! Select one in the Model tab.")
return 'Error: No model is loaded! Select one in the Model tab.', previous

is_non_hf_exllamav2 = shared.model.__class__.__name__ == 'Exllamav2Model'
is_non_hf_llamacpp = shared.model.__class__.__name__ == 'LlamaCppModel'

if use_samplers:
if any([is_non_hf_exllamav2, is_non_hf_llamacpp]):
if is_non_hf_llamacpp:
logger.error("Sampler hijacking is not supported non-Huggingface loaders.")
# sampling is all done in c for exllama, so it is really hard to hijack
# it should be possible to hijack llamacpp sampler by hijacking all their sampling methods,
@@ -31,20 +30,15 @@ def get_next_logits(prompt, state, use_samplers, previous, top_logits=25, return

scores = sampler_hijack.global_scores[-1]
else:
if is_non_hf_exllamav2:
if is_torch_xpu_available():
tokens = shared.tokenizer.encode(prompt).to("xpu:0")
else:
tokens = shared.tokenizer.encode(prompt).cuda()
scores = shared.model.get_logits(tokens)[-1][-1]
elif is_non_hf_llamacpp:
if is_non_hf_llamacpp:
tokens = shared.tokenizer.encode(prompt)
scores = shared.model.get_logits(tokens)[-1][-1]
else:
if is_torch_xpu_available():
tokens = shared.tokenizer.encode(prompt, return_tensors='pt').to("xpu:0")
else:
tokens = shared.tokenizer.encode(prompt, return_tensors='pt').cuda()

output = shared.model(input_ids=tokens)
scores = output['logits'][-1][-1]

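With the `Exllamav2Model` branch removed above, llama.cpp is the only loader that still bypasses the Hugging Face logits path in `get_next_logits`; ExLlamaV2 models now go through the regular `shared.model(input_ids=...)` forward pass. A condensed, stub-based sketch of the branching that remains (only the class-name checks come from the diff; the stub classes and return strings are illustrative):

```python
# Stub-based sketch of the branch structure left in get_next_logits().
class LlamaCppModel: ...
class Exllamav2HF: ...

def logits_path(model, use_samplers: bool) -> str:
    is_non_hf_llamacpp = model.__class__.__name__ == 'LlamaCppModel'
    if use_samplers and is_non_hf_llamacpp:
        return 'error: sampler hijacking is not supported for llama.cpp'
    if is_non_hf_llamacpp:
        return 'raw llama.cpp get_logits() path'
    return 'HF forward pass: shared.model(input_ids=...)'

print(logits_path(LlamaCppModel(), use_samplers=True))   # error path
print(logits_path(Exllamav2HF(), use_samplers=False))    # HF forward pass
```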
8 changes: 0 additions & 8 deletions modules/models.py
@@ -65,7 +65,6 @@ def load_model(model_name, loader=None):
'GPTQ-for-LLaMa': GPTQ_loader,
'llama.cpp': llamacpp_loader,
'llamacpp_HF': llamacpp_HF_loader,
'ExLlamav2': ExLlamav2_loader,
'ExLlamav2_HF': ExLlamav2_HF_loader,
'ctransformers': ctransformers_loader,
'AutoAWQ': AutoAWQ_loader,
@@ -376,13 +375,6 @@ def AutoGPTQ_loader(model_name):
return modules.AutoGPTQ_loader.load_quantized(model_name)


def ExLlamav2_loader(model_name):
from modules.exllamav2 import Exllamav2Model

model, tokenizer = Exllamav2Model.from_pretrained(model_name)
return model, tokenizer


def ExLlamav2_HF_loader(model_name):
from modules.exllamav2_hf import Exllamav2HF

2 changes: 2 additions & 0 deletions modules/models_settings.py
@@ -141,6 +141,8 @@ def get_model_metadata(model):
if re.match(pat.lower(), model.lower()):
for k in settings[pat]:
model_settings[k] = settings[pat][k]
if k == 'loader' and settings[pat][k] == 'ExLlamav2':
model_settings[k] = 'ExLlamav2_HF'

return model_settings

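The two lines added here are a compatibility shim: any saved per-model setting that still requests the removed `ExLlamav2` loader is transparently remapped to `ExLlamav2_HF`. A minimal, self-contained sketch of that remapping, using a made-up settings dict (only the two remapping lines mirror the diff):

```python
import re

# Hypothetical settings dict for illustration; the key is a regex matched against the model name.
settings = {'.*mistral.*exl2': {'loader': 'ExLlamav2', 'max_seq_len': 4096}}

def resolve_model_settings(model, settings=settings):
    model_settings = {}
    for pat in settings:
        if re.match(pat.lower(), model.lower()):
            for k in settings[pat]:
                model_settings[k] = settings[pat][k]
                # The compatibility shim added by this commit:
                if k == 'loader' and settings[pat][k] == 'ExLlamav2':
                    model_settings[k] = 'ExLlamav2_HF'
    return model_settings

print(resolve_model_settings('MyMistral-7B-exl2'))
# {'loader': 'ExLlamav2_HF', 'max_seq_len': 4096}
```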
14 changes: 5 additions & 9 deletions modules/shared.py
@@ -88,7 +88,7 @@

# Model loader
group = parser.add_argument_group('Model loader')
group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ, AutoAWQ, GPTQ-for-LLaMa, ctransformers, QuIP#.')
group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, AutoGPTQ, AutoAWQ, GPTQ-for-LLaMa, ctransformers, QuIP#.')

# Transformers/Accelerate
group = parser.add_argument_group('Transformers/Accelerate')
@@ -130,11 +130,11 @@
group.add_argument('--no_offload_kqv', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
group.add_argument('--cache-capacity', type=str, help='Maximum cache capacity (llama-cpp-python). Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed.')

# ExLlama
group = parser.add_argument_group('ExLlama')
# ExLlamaV2
group = parser.add_argument_group('ExLlamaV2')
group.add_argument('--gpu-split', type=str, help='Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7.')
group.add_argument('--max_seq_len', type=int, default=2048, help='Maximum sequence length.')
group.add_argument('--cfg-cache', action='store_true', help='ExLlamav2_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.')
group.add_argument('--cfg-cache', action='store_true', help='Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.')
group.add_argument('--no_flash_attn', action='store_true', help='Force flash-attention to not be used.')
group.add_argument('--cache_8bit', action='store_true', help='Use 8-bit cache to save VRAM.')
group.add_argument('--num_experts_per_token', type=int, default=2, help='Number of experts to use for generation. Applies to MoE models like Mixtral.')
@@ -248,11 +248,7 @@ def fix_loader_name(name):
return 'AutoGPTQ'
elif name in ['gptq-for-llama', 'gptqforllama', 'gptqllama', 'gptq for llama', 'gptq_for_llama']:
return 'GPTQ-for-LLaMa'
elif name in ['exllama', 'ex-llama', 'ex_llama', 'exlama']:
return 'ExLlama'
elif name in ['exllamav2', 'exllama-v2', 'ex_llama-v2', 'exlamav2', 'exlama-v2', 'exllama2', 'exllama-2']:
return 'ExLlamav2'
elif name in ['exllamav2-hf', 'exllamav2_hf', 'exllama-v2-hf', 'exllama_v2_hf', 'exllama-v2_hf', 'exllama2-hf', 'exllama2_hf', 'exllama-2-hf', 'exllama_2_hf', 'exllama-2_hf']:
elif name in ['exllamav2', 'exllama-v2', 'ex_llama-v2', 'exlamav2', 'exlama-v2', 'exllama2', 'exllama-2', 'exllama', 'ex-llama', 'ex_llama', 'exlama', 'exllamav2-hf', 'exllamav2_hf', 'exllama-v2-hf', 'exllama_v2_hf', 'exllama-v2_hf', 'exllama2-hf', 'exllama2_hf', 'exllama-2-hf', 'exllama_2_hf', 'exllama-2_hf']:
return 'ExLlamav2_HF'
elif name in ['ctransformers', 'ctranforemrs', 'ctransformer']:
return 'ctransformers'
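The merged alias list above means every historical spelling of ExLlama or ExLlamaV2 now resolves to the HF loader. A reduced, self-contained sketch of just that branch (the up-front lower-casing is an assumption about how the surrounding function normalizes its input; only the alias strings come from the diff):

```python
# Aliases copied from the hunk above; all of them now map to the HF loader.
EXLLAMA_ALIASES = {
    'exllamav2', 'exllama-v2', 'ex_llama-v2', 'exlamav2', 'exlama-v2', 'exllama2', 'exllama-2',
    'exllama', 'ex-llama', 'ex_llama', 'exlama',
    'exllamav2-hf', 'exllamav2_hf', 'exllama-v2-hf', 'exllama_v2_hf', 'exllama-v2_hf',
    'exllama2-hf', 'exllama2_hf', 'exllama-2-hf', 'exllama_2_hf', 'exllama-2_hf',
}

def fix_loader_name_sketch(name: str) -> str:
    """Reduced sketch: only the ExLlama/ExLlamaV2 branch of fix_loader_name is shown."""
    if name.lower() in EXLLAMA_ALIASES:
        return 'ExLlamav2_HF'
    return name

for spelling in ('exllama', 'ExLlamav2', 'exllama2-hf'):
    print(spelling, '->', fix_loader_name_sketch(spelling))  # all three print ExLlamav2_HF
```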
11 changes: 6 additions & 5 deletions modules/text_generation.py
@@ -45,7 +45,7 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
yield ''
return

if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'CtransformersModel']:
if shared.model.__class__.__name__ in ['LlamaCppModel', 'CtransformersModel']:
generate_func = generate_reply_custom
else:
generate_func = generate_reply_HF
@@ -120,10 +120,11 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
if shared.tokenizer is None:
raise ValueError('No tokenizer is loaded')

if shared.model.__class__.__name__ in ['LlamaCppModel', 'CtransformersModel', 'Exllamav2Model']:
if shared.model.__class__.__name__ in ['LlamaCppModel', 'CtransformersModel']:
input_ids = shared.tokenizer.encode(str(prompt))
if shared.model.__class__.__name__ not in ['Exllamav2Model']:
input_ids = np.array(input_ids).reshape(1, len(input_ids))
# The step below is necessary for llama.cpp, but may not be
# necessary for future loaders.
input_ids = np.array(input_ids).reshape(1, len(input_ids))
else:
input_ids = shared.tokenizer.encode(str(prompt), return_tensors='pt', add_special_tokens=add_special_tokens)
if not add_bos_token:
@@ -134,7 +135,7 @@
if truncation_length is not None:
input_ids = input_ids[:, -truncation_length:]

if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'CtransformersModel'] or shared.args.cpu:
if shared.model.__class__.__name__ in ['LlamaCppModel', 'CtransformersModel'] or shared.args.cpu:
return input_ids
elif shared.args.deepspeed:
return input_ids.to(device=local_rank)
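The comment added in the `encode` hunk above documents why the reshape is kept: llama.cpp's tokenizer returns a flat Python list, while the rest of the pipeline expects the `(1, seq_len)` batch shape that HF tokenizers produce. A small numpy illustration with made-up token ids:

```python
import numpy as np

token_ids = [1, 3087, 1404, 29991]  # made-up token ids, as a llama.cpp tokenizer would return them
input_ids = np.array(token_ids).reshape(1, len(token_ids))
print(input_ids.shape)  # (1, 4): one batch row, matching the shape of HF tokenizer output
```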
1 change: 0 additions & 1 deletion modules/ui_model_menu.py
@@ -135,7 +135,6 @@ def create_ui():
shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.')
shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.')
shared.gradio['gptq_for_llama_info'] = gr.Markdown('Legacy loader for compatibility with older GPUs. ExLlamav2_HF or AutoGPTQ are preferred for GPTQ models when supported.')
shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.")
shared.gradio['llamacpp_HF_info'] = gr.Markdown('llamacpp_HF loads llama.cpp as a Transformers model. To use it, you need to download a tokenizer.\n\nOption 1 (recommended): place your .gguf in a subfolder of models/ along with these 4 files: special_tokens_map.json, tokenizer_config.json, tokenizer.json, tokenizer.model.\n\nOption 2: download `oobabooga/llama-tokenizer` under "Download model or LoRA". That\'s a default Llama tokenizer that will work for some (but not all) models.')

with gr.Column():
