From c419206ce1ff3c49e1b08398809a3b8c8e1495a8 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 27 Dec 2023 09:59:23 -0800 Subject: [PATCH 01/10] Lint the JS/CSS --- css/html_instruct_style.css | 13 ++++++------- css/main.css | 4 ++-- js/main.js | 26 +++++++++++++------------- js/show_controls.js | 4 ++-- js/update_big_picture.js | 2 +- 5 files changed, 24 insertions(+), 25 deletions(-) diff --git a/css/html_instruct_style.css b/css/html_instruct_style.css index 130c353a7c..673dad2a11 100644 --- a/css/html_instruct_style.css +++ b/css/html_instruct_style.css @@ -1,13 +1,13 @@ .chat { - background: var(--block-background-fill); - padding: 24px 19px; - padding-right: 19px !important; - padding-top: 0px; - border: 1px solid var(--block-border-color); + background: var(--block-background-fill); + padding: 24px 19px; + padding-right: 19px !important; + padding-top: 0; + border: 1px solid var(--block-border-color); } .chat > .messages { - padding-top: 28px !important; + padding-top: 28px !important; } .message { @@ -62,7 +62,6 @@ .gradio-container .chat .user-message { padding: 20px; background-color: var(--color-accent-soft); - border-radius: 20px; margin-bottom: 12px !important; margin-left: 16px; border-radius: 22px; diff --git a/css/main.css b/css/main.css index fa706ba955..bb944cc9c3 100644 --- a/css/main.css +++ b/css/main.css @@ -92,7 +92,7 @@ div.svelte-15lo0d8 > *, div.svelte-15lo0d8 > .form > * { .header_bar { background-color: #f7f7f7; box-shadow: 0 2px 3px rgba(22 22 22 / 35%); - margin-bottom: 0px; + margin-bottom: 0; overflow-x: scroll; margin-left: calc(-1 * var(--size-4)); margin-right: calc(-1 * var(--size-4)); @@ -303,7 +303,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } #chat-tab { - padding-top: 0px; + padding-top: 0; } #chat-tab button#Generate, #chat-tab button#stop { diff --git a/js/main.js b/js/main.js index 0828dee45b..dd57dde35b 100644 --- a/js/main.js +++ b/js/main.js @@ -178,11 +178,11 @@ for(i = 0; i < noBackgroundelements.length; i++) { noBackgroundelements[i].parentNode.parentNode.parentNode.style.alignItems = "center"; } -const slimDropdownElements = document.querySelectorAll('.slim-dropdown'); +const slimDropdownElements = document.querySelectorAll(".slim-dropdown"); for (i = 0; i < slimDropdownElements.length; i++) { - const parentNode = slimDropdownElements[i].parentNode; - parentNode.style.background = 'transparent'; - parentNode.style.border = '0'; + const parentNode = slimDropdownElements[i].parentNode; + parentNode.style.background = "transparent"; + parentNode.style.border = "0"; } //------------------------------------------------ @@ -313,7 +313,7 @@ function addBigPicture() { } function deleteBigPicture() { - var bigProfilePictures = document.querySelectorAll('.bigProfilePicture'); + var bigProfilePictures = document.querySelectorAll(".bigProfilePicture"); bigProfilePictures.forEach(function (element) { element.parentNode.removeChild(element); }); @@ -337,16 +337,16 @@ let currentChatInputHeight = 0; function updateCssProperties() { // Set the height of the chat area - const chatContainer = document.getElementById('chat').parentNode.parentNode.parentNode; - const chatInputHeight = document.querySelector('#chat-input textarea').clientHeight; + const chatContainer = document.getElementById("chat").parentNode.parentNode.parentNode; + const chatInputHeight = document.querySelector("#chat-input textarea").clientHeight; const newChatHeight = `${chatContainer.clientHeight - chatInputHeight + 40}px`; - document.documentElement.style.setProperty('--chat-height', newChatHeight); - document.documentElement.style.setProperty('--input-delta', `${chatInputHeight - 40}px`); + document.documentElement.style.setProperty("--chat-height", newChatHeight); + document.documentElement.style.setProperty("--input-delta", `${chatInputHeight - 40}px`); // Set the position offset of the chat input box - const header = document.querySelector('.header_bar'); + const header = document.querySelector(".header_bar"); const headerHeight = `${header.clientHeight}px`; - document.documentElement.style.setProperty('--header-height', headerHeight); + document.documentElement.style.setProperty("--header-height", headerHeight); // Offset the scroll position of the chat area if (chatInputHeight !== currentChatInputHeight) { @@ -356,6 +356,6 @@ function updateCssProperties() { } new ResizeObserver(updateCssProperties) - .observe(document.querySelector('#chat-input textarea')); + .observe(document.querySelector("#chat-input textarea")); -window.addEventListener('resize', updateCssProperties); +window.addEventListener("resize", updateCssProperties); diff --git a/js/show_controls.js b/js/show_controls.js index cdd6efc40b..1ff88e52aa 100644 --- a/js/show_controls.js +++ b/js/show_controls.js @@ -12,9 +12,9 @@ function toggle_controls(value) { document.getElementById("chat-col").classList.remove("bigchat"); document.getElementById("chat-tab").style.paddingBottom = ""; - let gallery_element = document.getElementById('gallery-extension'); + let gallery_element = document.getElementById("gallery-extension"); if (gallery_element) { - gallery_element.style.display = 'block'; + gallery_element.style.display = "block"; } } else { diff --git a/js/update_big_picture.js b/js/update_big_picture.js index 1984215aee..4c094776b9 100644 --- a/js/update_big_picture.js +++ b/js/update_big_picture.js @@ -1,5 +1,5 @@ function updateBigPicture() { - var existingElement = document.querySelector('.bigProfilePicture'); + var existingElement = document.querySelector(".bigProfilePicture"); if (existingElement) { var timestamp = new Date().getTime(); existingElement.src = "/file/cache/pfp_character.png?time=" + timestamp; From a4079e879e6a3f5ef2f9ab24475b357542aba373 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 27 Dec 2023 11:51:55 -0800 Subject: [PATCH 02/10] CSS: don't change --chat-height when outside the chat tab --- js/main.js | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/js/main.js b/js/main.js index dd57dde35b..59c70c7318 100644 --- a/js/main.js +++ b/js/main.js @@ -339,19 +339,21 @@ function updateCssProperties() { // Set the height of the chat area const chatContainer = document.getElementById("chat").parentNode.parentNode.parentNode; const chatInputHeight = document.querySelector("#chat-input textarea").clientHeight; - const newChatHeight = `${chatContainer.clientHeight - chatInputHeight + 40}px`; - document.documentElement.style.setProperty("--chat-height", newChatHeight); - document.documentElement.style.setProperty("--input-delta", `${chatInputHeight - 40}px`); - - // Set the position offset of the chat input box - const header = document.querySelector(".header_bar"); - const headerHeight = `${header.clientHeight}px`; - document.documentElement.style.setProperty("--header-height", headerHeight); - - // Offset the scroll position of the chat area - if (chatInputHeight !== currentChatInputHeight) { - chatContainer.scrollTop += chatInputHeight > currentChatInputHeight ? chatInputHeight : -chatInputHeight; - currentChatInputHeight = chatInputHeight; + if (chatContainer.clientHeight > 0) { + const newChatHeight = `${chatContainer.clientHeight - chatInputHeight + 40}px`; + document.documentElement.style.setProperty("--chat-height", newChatHeight); + document.documentElement.style.setProperty("--input-delta", `${chatInputHeight - 40}px`); + + // Set the position offset of the chat input box + const header = document.querySelector(".header_bar"); + const headerHeight = `${header.clientHeight}px`; + document.documentElement.style.setProperty("--header-height", headerHeight); + + // Offset the scroll position of the chat area + if (chatInputHeight !== currentChatInputHeight) { + chatContainer.scrollTop += chatInputHeight > currentChatInputHeight ? chatInputHeight : -chatInputHeight; + currentChatInputHeight = chatInputHeight; + } } } From 20a2eaaf95eeb77c2e6b38de1de0f7977a98c21b Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 27 Dec 2023 12:58:07 -0800 Subject: [PATCH 03/10] Add .vs to .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index cf47b62836..ca307c4a95 100644 --- a/.gitignore +++ b/.gitignore @@ -30,6 +30,7 @@ venv .envrc .direnv +.vs .vscode *.bak *.ipynb From b7dd1f95427d66b6ecb102058ba29f0d2726e780 Mon Sep 17 00:00:00 2001 From: B611 <35844889+B611@users.noreply.github.com> Date: Sun, 31 Dec 2023 05:34:32 +0100 Subject: [PATCH 04/10] Specify utf-8 encoding for model metadata file open (#5125) --- modules/models_settings.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/models_settings.py b/modules/models_settings.py index 4e1fb1ad38..919a15bc27 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -35,7 +35,7 @@ def get_model_metadata(model): path = Path(f'{shared.args.model_dir}/{model}/config.json') if path.exists(): - hf_metadata = json.loads(open(path, 'r').read()) + hf_metadata = json.loads(open(path, 'r', encoding='utf-8').read()) else: hf_metadata = None @@ -78,7 +78,7 @@ def get_model_metadata(model): else: # Transformers metadata if hf_metadata is not None: - metadata = json.loads(open(path, 'r').read()) + metadata = json.loads(open(path, 'r', encoding='utf-8').read()) if 'max_position_embeddings' in metadata: model_settings['truncation_length'] = metadata['max_position_embeddings'] model_settings['max_seq_len'] = metadata['max_position_embeddings'] @@ -101,7 +101,7 @@ def get_model_metadata(model): # Read AutoGPTQ metadata path = Path(f'{shared.args.model_dir}/{model}/quantize_config.json') if path.exists(): - metadata = json.loads(open(path, 'r').read()) + metadata = json.loads(open(path, 'r', encoding='utf-8').read()) if 'bits' in metadata: model_settings['wbits'] = metadata['bits'] if 'group_size' in metadata: @@ -112,7 +112,7 @@ def get_model_metadata(model): # Try to find the Jinja instruct template path = Path(f'{shared.args.model_dir}/{model}') / 'tokenizer_config.json' if path.exists(): - metadata = json.loads(open(path, 'r').read()) + metadata = json.loads(open(path, 'r', encoding='utf-8').read()) if 'chat_template' in metadata: template = metadata['chat_template'] for k in ['eos_token', 'bos_token']: From 8e397915c9bc71aa556c47189b21af2475305e17 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 31 Dec 2023 01:36:51 -0300 Subject: [PATCH 05/10] Remove --sdp-attention, --xformers flags (#5126) --- README.md | 2 - modules/llama_attn_hijack.py | 171 ----------------------------------- modules/models.py | 6 +- modules/shared.py | 2 - 4 files changed, 1 insertion(+), 180 deletions(-) delete mode 100644 modules/llama_attn_hijack.py diff --git a/README.md b/README.md index d0a347c795..15cca711c3 100644 --- a/README.md +++ b/README.md @@ -231,8 +231,6 @@ List of command-line flags | `--load-in-8bit` | Load the model with 8-bit precision (using bitsandbytes). | | `--bf16` | Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. | | `--no-cache` | Set `use_cache` to `False` while generating text. This reduces VRAM usage slightly, but it comes at a performance cost. | -| `--xformers` | Use xformer's memory efficient attention. This is really old and probably doesn't do anything. | -| `--sdp-attention` | Use PyTorch 2.0's SDP attention. Same as above. | | `--trust-remote-code` | Set `trust_remote_code=True` while loading the model. Necessary for some models. | | `--no_use_fast` | Set use_fast=False while loading the tokenizer (it's True by default). Use this if you have any problems related to use_fast. | | `--use_flash_attention_2` | Set use_flash_attention_2=True while loading the model. | diff --git a/modules/llama_attn_hijack.py b/modules/llama_attn_hijack.py deleted file mode 100644 index 00436fb2ed..0000000000 --- a/modules/llama_attn_hijack.py +++ /dev/null @@ -1,171 +0,0 @@ -import math -import sys -from typing import Optional, Tuple - -import torch -import torch.nn as nn - -import modules.shared as shared -from modules.logging_colors import logger - -if shared.args.xformers: - try: - import xformers.ops - except Exception: - logger.error("xformers not found! Please install it before trying to use it.", file=sys.stderr) - - -def hijack_llama_attention(): - import transformers.models.llama.modeling_llama - if shared.args.xformers: - transformers.models.llama.modeling_llama.LlamaAttention.forward = xformers_forward - logger.info("Replaced attention with xformers_attention") - elif shared.args.sdp_attention: - transformers.models.llama.modeling_llama.LlamaAttention.forward = sdp_attention_forward - logger.info("Replaced attention with sdp_attention") - - -def xformers_forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = transformers.models.llama.modeling_llama.apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - # [bsz, nh, t, hd] - - if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None - - # We only apply xformers optimizations if we don't need to output the whole attention matrix - if not output_attentions: - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - # This is a nasty hack. We know attention_mask in transformers is either LowerTriangular or all Zeros. - # We therefore check if one element in the upper triangular portion is zero. If it is, then the mask is all zeros. - if attention_mask is None or attention_mask[0, 0, 0, 1] == 0: - # input and output should be of form (bsz, q_len, num_heads, head_dim) - attn_output = xformers.ops.memory_efficient_attention(query_states, key_states, value_states, attn_bias=None) - else: - # input and output should be of form (bsz, q_len, num_heads, head_dim) - attn_output = xformers.ops.memory_efficient_attention(query_states, key_states, value_states, attn_bias=xformers.ops.LowerTriangularMask()) - attn_weights = None - else: - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz * self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - attn_weights = attn_weights + attention_mask - attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)) - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2) - - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - attn_output = self.o_proj(attn_output) - return attn_output, attn_weights, past_key_value - - -def sdp_attention_forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = transformers.models.llama.modeling_llama.apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - # [bsz, nh, t, hd] - - if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None - - # We only apply sdp attention if we don't need to output the whole attention matrix - if not output_attentions: - attn_output = torch.nn.functional.scaled_dot_product_attention(query_states, key_states, value_states, attn_mask=attention_mask, is_causal=False) - attn_weights = None - else: - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz * self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - attn_weights = attn_weights + attention_mask - attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)) - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - return attn_output, attn_weights, past_key_value diff --git a/modules/models.py b/modules/models.py index e166f737a4..5235f10857 100644 --- a/modules/models.py +++ b/modules/models.py @@ -21,7 +21,7 @@ ) import modules.shared as shared -from modules import RoPE, llama_attn_hijack, sampler_hijack +from modules import RoPE, sampler_hijack from modules.logging_colors import logger from modules.models_settings import get_model_metadata from modules.relative_imports import RelativeImport @@ -97,10 +97,6 @@ def load_model(model_name, loader=None): else: tokenizer = load_tokenizer(model_name, model) - # Hijack attention with xformers - if any((shared.args.xformers, shared.args.sdp_attention)): - llama_attn_hijack.hijack_llama_attention() - shared.settings.update({k: v for k, v in metadata.items() if k in shared.settings}) if loader.lower().startswith('exllama'): shared.settings['truncation_length'] = shared.args.max_seq_len diff --git a/modules/shared.py b/modules/shared.py index f98343b866..36ace23c2d 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -98,8 +98,6 @@ group.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision (using bitsandbytes).') group.add_argument('--bf16', action='store_true', help='Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.') group.add_argument('--no-cache', action='store_true', help='Set use_cache to False while generating text. This reduces VRAM usage slightly, but it comes at a performance cost.') -group.add_argument('--xformers', action='store_true', help='Use xformer\'s memory efficient attention. This is really old and probably doesn\'t do anything.') -group.add_argument('--sdp-attention', action='store_true', help='Use PyTorch 2.0\'s SDP attention. Same as above.') group.add_argument('--trust-remote-code', action='store_true', help='Set trust_remote_code=True while loading the model. Necessary for some models.') group.add_argument('--force-safetensors', action='store_true', help='Set use_safetensors=True while loading the model. This prevents arbitrary code execution.') group.add_argument('--no_use_fast', action='store_true', help='Set use_fast=False while loading the tokenizer (it\'s True by default). Use this if you have any problems related to use_fast.') From 0e54a09bcbca4bc69dcc1c887f92d3fefe4e5f6a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 31 Dec 2023 01:57:06 -0300 Subject: [PATCH 06/10] Remove exllamav1 loaders (#5128) --- README.md | 17 +-- docs/04 - Model Tab.md | 21 +--- docs/What Works.md | 2 - modules/LoRA.py | 44 ------- modules/exllama.py | 237 ------------------------------------ modules/exllama_hf.py | 174 -------------------------- modules/loaders.py | 76 +----------- modules/logits.py | 7 +- modules/models.py | 15 --- modules/models_settings.py | 10 +- modules/shared.py | 6 +- modules/text_generation.py | 4 +- modules/ui_model_menu.py | 5 +- one_click.py | 21 ---- requirements.txt | 8 -- requirements_amd.txt | 4 - requirements_amd_noavx2.txt | 4 - requirements_noavx2.txt | 8 -- 18 files changed, 28 insertions(+), 635 deletions(-) delete mode 100644 modules/exllama.py delete mode 100644 modules/exllama_hf.py diff --git a/README.md b/README.md index 15cca711c3..5df4d6bc53 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. ## Features * 3 interface modes: default (two columns), notebook, and chat. -* Multiple model backends: [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp) (through [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)), [ExLlama](https://github.com/turboderp/exllama), [ExLlamaV2](https://github.com/turboderp/exllamav2), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa), [CTransformers](https://github.com/marella/ctransformers), [QuIP#](https://github.com/Cornell-RelaxML/quip-sharp). +* Multiple model backends: [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp) (through [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)), [ExLlamaV2](https://github.com/turboderp/exllamav2), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa), [CTransformers](https://github.com/marella/ctransformers), [QuIP#](https://github.com/Cornell-RelaxML/quip-sharp). * Dropdown menu for quickly switching between different models. * Large number of extensions (built-in and user-contributed), including Coqui TTS for realistic voice outputs, Whisper STT for voice inputs, translation, [multimodal pipelines](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/multimodal), vector databases, Stable Diffusion integration, and a lot more. See [the wiki](https://github.com/oobabooga/text-generation-webui/wiki/07-%E2%80%90-Extensions) and [the extensions directory](https://github.com/oobabooga/text-generation-webui-extensions) for details. * [Chat with custom characters](https://github.com/oobabooga/text-generation-webui/wiki/03-%E2%80%90-Parameters-Tab#character). @@ -140,13 +140,6 @@ Then browse to 3) Manually install AutoGPTQ: [Installation](https://github.com/PanQiWei/AutoGPTQ#install-from-source). * Perform the from-source installation - there are no prebuilt ROCm packages for Windows. -4) Manually install [ExLlama](https://github.com/turboderp/exllama) by simply cloning it into the `repositories` folder (it will be automatically compiled at runtime after that): - -```sh -cd text-generation-webui -git clone https://github.com/turboderp/exllama repositories/exllama -``` - ##### Older NVIDIA GPUs 1) For Kepler GPUs and older, you will need to install CUDA 11.8 instead of 12: @@ -216,7 +209,7 @@ List of command-line flags | Flag | Description | |--------------------------------------------|-------------| -| `--loader LOADER` | Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlama_HF, ExLlamav2_HF, AutoGPTQ, AutoAWQ, GPTQ-for-LLaMa, ExLlama, ExLlamav2, ctransformers, QuIP#. | +| `--loader LOADER` | Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ, AutoAWQ, GPTQ-for-LLaMa, ctransformers, QuIP#. | #### Accelerate/transformers @@ -265,13 +258,13 @@ List of command-line flags | `--no_offload_kqv` | Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance. | | `--cache-capacity CACHE_CAPACITY` | Maximum cache capacity (llama-cpp-python). Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed. | -#### ExLlama +#### ExLlamav2 | Flag | Description | |------------------|-------------| |`--gpu-split` | Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7. | |`--max_seq_len MAX_SEQ_LEN` | Maximum sequence length. | -|`--cfg-cache` | ExLlama_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader, but not necessary for CFG with base ExLlama. | +|`--cfg-cache` | ExLlamav2_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader. | |`--no_flash_attn` | Force flash-attention to not be used. | |`--cache_8bit` | Use 8-bit cache to save VRAM. | |`--num_experts_per_token NUM_EXPERTS_PER_TOKEN` | Number of experts to use for generation. Applies to MoE models like Mixtral. | @@ -326,7 +319,7 @@ List of command-line flags | `--rwkv-strategy RWKV_STRATEGY` | RWKV: The strategy to use while loading the model. Examples: "cpu fp32", "cuda fp16", "cuda fp16i8". | | `--rwkv-cuda-on` | RWKV: Compile the CUDA kernel for better performance. | -#### RoPE (for llama.cpp, ExLlama, ExLlamaV2, and transformers) +#### RoPE (for llama.cpp, ExLlamaV2, and transformers) | Flag | Description | |------------------|-------------| diff --git a/docs/04 - Model Tab.md b/docs/04 - Model Tab.md index 4bbf260e51..2f1e07e653 100644 --- a/docs/04 - Model Tab.md +++ b/docs/04 - Model Tab.md @@ -32,13 +32,14 @@ Options: * **use_flash_attention_2**: Set use_flash_attention_2=True while loading the model. Possibly useful for training. * **disable_exllama**: Only applies when you are loading a GPTQ model through the transformers loader. It needs to be checked if you intend to train LoRAs with the model. -### ExLlama_HF +### ExLlamav2_HF -Loads: GPTQ models. They usually have GPTQ in the model name, or alternatively something like "-4bit-128g" in the name. +Loads: GPTQ and EXL2 models. EXL2 models usually have "EXL2" in the model name, while GPTQ models usually have GPTQ in the model name, or alternatively something like "-4bit-128g" in the name. -Example: https://huggingface.co/TheBloke/Llama-2-13B-chat-GPTQ +Examples: -ExLlama_HF is the v1 of ExLlama (https://github.com/turboderp/exllama) connected to the transformers library for sampling, tokenizing, and detokenizing. It is very fast and memory-efficient. +* https://huggingface.co/turboderp/Llama2-70B-exl2 +* https://huggingface.co/TheBloke/Llama-2-13B-chat-GPTQ * **gpu-split**: If you have multiple GPUs, the amount of memory to allocate per GPU should be set in this field. Make sure to set a lower value for the first GPU, as that's where the cache is allocated. * **max_seq_len**: The maximum sequence length for the model. In ExLlama, the cache is preallocated, so the higher this value, the higher the VRAM. It is automatically set to the maximum sequence length for the model based on its metadata, but you may need to lower this value be able to fit the model into your GPU. After loading the model, the "Truncate the prompt up to this length" parameter under "Parameters" > "Generation" is automatically set to your chosen "max_seq_len" so that you don't have to set the same thing twice. @@ -46,18 +47,6 @@ ExLlama_HF is the v1 of ExLlama (https://github.com/turboderp/exllama) connected * **no_flash_attn**: Disables flash attention. Otherwise, it is automatically used as long as the library is installed. * **cache_8bit**: Create a 8-bit precision cache instead of a 16-bit one. This saves VRAM but increases perplexity (I don't know by how much). -### ExLlamav2_HF - -Loads: GPTQ and EXL2 models. EXL2 models usually have "EXL2" in the model name. - -Example: https://huggingface.co/turboderp/Llama2-70B-exl2 - -The parameters are the same as in ExLlama_HF. - -### ExLlama - -The same as ExLlama_HF but using the internal samplers of ExLlama instead of the ones in the Transformers library. - ### ExLlamav2 The same as ExLlamav2_HF but using the internal samplers of ExLlamav2 instead of the ones in the Transformers library. diff --git a/docs/What Works.md b/docs/What Works.md index dba34a808f..4f5defab2a 100644 --- a/docs/What Works.md +++ b/docs/What Works.md @@ -3,9 +3,7 @@ | Loader | Loading 1 LoRA | Loading 2 or more LoRAs | Training LoRAs | Multimodal extension | Perplexity evaluation | |----------------|----------------|-------------------------|----------------|----------------------|-----------------------| | Transformers | ✅ | ✅*** | ✅* | ✅ | ✅ | -| ExLlama_HF | ✅ | ❌ | ❌ | ❌ | ✅ | | ExLlamav2_HF | ✅ | ✅ | ❌ | ❌ | ✅ | -| ExLlama | ✅ | ❌ | ❌ | ❌ | use ExLlama_HF | | ExLlamav2 | ✅ | ✅ | ❌ | ❌ | use ExLlamav2_HF | | AutoGPTQ | ✅ | ❌ | ❌ | ✅ | ✅ | | GPTQ-for-LLaMa | ✅** | ✅*** | ✅ | ✅ | ✅ | diff --git a/modules/LoRA.py b/modules/LoRA.py index be2a7c75f8..2619815c67 100644 --- a/modules/LoRA.py +++ b/modules/LoRA.py @@ -12,8 +12,6 @@ def add_lora_to_model(lora_names): if 'GPTQForCausalLM' in shared.model.__class__.__name__ or shared.args.loader == 'AutoGPTQ': add_lora_autogptq(lora_names) - elif shared.model.__class__.__name__ in ['ExllamaModel', 'ExllamaHF'] or shared.args.loader == 'ExLlama': - add_lora_exllama(lora_names) elif shared.model.__class__.__name__ in ['Exllamav2Model', 'Exllamav2HF'] or shared.args.loader == ['ExLlamav2', 'ExLlamav2_HF']: add_lora_exllamav2(lora_names) else: @@ -28,48 +26,6 @@ def get_lora_path(lora_name): return Path(f"{shared.args.lora_dir}/{lora_name}") -def add_lora_exllama(lora_names): - - try: - from exllama.lora import ExLlamaLora - except: - try: - from repositories.exllama.lora import ExLlamaLora - except: - logger.error("Could not find the file repositories/exllama/lora.py. Make sure that exllama is cloned inside repositories/ and is up to date.") - return - - if len(lora_names) == 0: - if shared.model.__class__.__name__ == 'ExllamaModel': - shared.model.generator.lora = None - else: - shared.model.lora = None - - shared.lora_names = [] - return - else: - if len(lora_names) > 1: - logger.warning('ExLlama can only work with 1 LoRA at the moment. Only the first one in the list will be loaded.') - - lora_path = get_lora_path(lora_names[0]) - lora_config_path = lora_path / "adapter_config.json" - for file_name in ["adapter_model.safetensors", "adapter_model.bin"]: - file_path = lora_path / file_name - if file_path.is_file(): - lora_adapter_path = file_path - - logger.info("Applying the following LoRAs to {}: {}".format(shared.model_name, ', '.join([lora_names[0]]))) - if shared.model.__class__.__name__ == 'ExllamaModel': - lora = ExLlamaLora(shared.model.model, str(lora_config_path), str(lora_adapter_path)) - shared.model.generator.lora = lora - else: - lora = ExLlamaLora(shared.model.ex_model, str(lora_config_path), str(lora_adapter_path)) - shared.model.lora = lora - - shared.lora_names = [lora_names[0]] - return - - def add_lora_exllamav2(lora_names): from exllamav2 import ExLlamaV2Lora diff --git a/modules/exllama.py b/modules/exllama.py deleted file mode 100644 index 25c4c99d67..0000000000 --- a/modules/exllama.py +++ /dev/null @@ -1,237 +0,0 @@ -from pathlib import Path - -import torch -import torch.nn.functional as F -from torch import version as torch_version - -from modules import shared -from modules.logging_colors import logger -from modules.models import clear_torch_cache -from modules.text_generation import get_max_prompt_length - -try: - from exllama.generator import ExLlamaGenerator - from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig - from exllama.tokenizer import ExLlamaTokenizer -except: - logger.warning('exllama module failed to import. Will attempt to import from repositories/.') - try: - from modules.relative_imports import RelativeImport - - with RelativeImport("repositories/exllama"): - from generator import ExLlamaGenerator - from model import ExLlama, ExLlamaCache, ExLlamaConfig - from tokenizer import ExLlamaTokenizer - except: - logger.error( - "Could not find repositories/exllama. Please ensure that exllama" - " (https://github.com/turboderp/exllama) is cloned inside repositories/ and is up to date." - ) - raise - - -class ExllamaModel: - def __init__(self): - pass - - @classmethod - def from_pretrained(self, path_to_model): - - path_to_model = Path(f'{shared.args.model_dir}') / Path(path_to_model) - tokenizer_model_path = path_to_model / "tokenizer.model" - model_config_path = path_to_model / "config.json" - - # Find the model checkpoint - model_path = None - for ext in ['.safetensors', '.pt', '.bin']: - found = list(path_to_model.glob(f"*{ext}")) - if len(found) > 0: - if len(found) > 1: - logger.warning(f'More than one {ext} model has been found. The last one will be selected. It could be wrong.') - - model_path = found[-1] - break - - config = ExLlamaConfig(str(model_config_path)) - config.model_path = str(model_path) - config.max_seq_len = shared.args.max_seq_len - config.compress_pos_emb = shared.args.compress_pos_emb - if shared.args.gpu_split: - config.set_auto_map(shared.args.gpu_split) - config.gpu_peer_fix = True - - if shared.args.alpha_value > 1 and shared.args.rope_freq_base == 0: - config.alpha_value = shared.args.alpha_value - config.calculate_rotary_embedding_base() - elif shared.args.rope_freq_base > 0: - config.rotary_embedding_base = shared.args.rope_freq_base - - if torch_version.hip: - config.rmsnorm_no_half2 = True - config.rope_no_half2 = True - config.matmul_no_half2 = True - config.silu_no_half2 = True - - model = ExLlama(config) - tokenizer = ExLlamaTokenizer(str(tokenizer_model_path)) - cache = ExLlamaCache(model) - generator = ExLlamaGenerator(model, tokenizer, cache) - - result = self() - result.config = config - result.model = model - result.cache = cache - result.tokenizer = tokenizer - result.generator = generator - return result, result - - def encode(self, string, **kwargs): - return self.tokenizer.encode(string, max_seq_len=self.model.config.max_seq_len, add_bos=True) - - def decode(self, ids, **kwargs): - if isinstance(ids, list): - ids = torch.tensor([ids]) - elif isinstance(ids, torch.Tensor) and ids.numel() == 1: - ids = ids.view(1, -1) - - return self.tokenizer.decode(ids)[0] - - def get_logits(self, token_ids, **kwargs): - self.cache.current_seq_len = 0 - if token_ids.shape[-1] > 1: - self.model.forward(token_ids[:, :-1], self.cache, input_mask=None, preprocess_only=True) - - return self.model.forward(token_ids[:, -1:], self.cache, **kwargs).float().cpu() - - def generate_with_streaming(self, prompt, state): - - # The cache batch size must be 2 for CFG and 1 otherwise - if state['guidance_scale'] == 1: - if self.cache.batch_size == 2: - del self.cache - clear_torch_cache() - self.cache = ExLlamaCache(self.model) - self.generator = ExLlamaGenerator(self.model, self.tokenizer, self.cache) - else: - if self.cache.batch_size == 1: - del self.cache - clear_torch_cache() - self.cache = ExLlamaCache(self.model, batch_size=2) - self.generator = ExLlamaGenerator(self.model, self.tokenizer, self.cache) - - self.generator.settings.temperature = state['temperature'] - self.generator.settings.top_p = state['top_p'] - self.generator.settings.top_k = state['top_k'] - self.generator.settings.typical = state['typical_p'] - self.generator.settings.token_repetition_penalty_max = state['repetition_penalty'] - self.generator.settings.token_repetition_penalty_sustain = -1 if state['repetition_penalty_range'] <= 0 else state['repetition_penalty_range'] - if state['ban_eos_token']: - self.generator.disallow_tokens([self.tokenizer.eos_token_id]) - else: - self.generator.disallow_tokens(None) - - if state['custom_token_bans']: - to_ban = [int(x) for x in state['custom_token_bans'].split(',')] - if len(to_ban) > 0: - self.generator.disallow_tokens(to_ban) - - # Case 1: no CFG - if state['guidance_scale'] == 1: - self.generator.end_beam_search() - - # Tokenizing the input - ids = self.generator.tokenizer.encode(prompt, max_seq_len=self.model.config.max_seq_len) - if state['add_bos_token']: - ids = torch.cat( - [torch.tensor([[self.tokenizer.bos_token_id]]).to(ids.device), - ids], dim=1 - ).to(torch.int64) - ids = ids[:, -get_max_prompt_length(state):] - if state['auto_max_new_tokens']: - max_new_tokens = state['truncation_length'] - ids.shape[-1] - else: - max_new_tokens = state['max_new_tokens'] - - self.generator.gen_begin_reuse(ids) - initial_len = self.generator.sequence[0].shape[0] - has_leading_space = False - - for i in range(max_new_tokens): - token = self.generator.gen_single_token() - if i == 0 and self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'): - has_leading_space = True - - decoded_text = self.generator.tokenizer.decode(self.generator.sequence[0][initial_len:]) - if has_leading_space: - decoded_text = ' ' + decoded_text - - # Check the partial unicode character - if chr(0xfffd) in decoded_text: - is_last = i == max_new_tokens - 1 - is_stopping = token.item() == self.generator.tokenizer.eos_token_id or shared.stop_everything - # If we are not at the end of the generation, we skip this token - if not (is_last or is_stopping): - continue - - if token.item() == self.generator.tokenizer.eos_token_id or shared.stop_everything: - break - - yield decoded_text - - # Case 2: CFG - # Copied from https://github.com/turboderp/exllama/blob/master/example_cfg.py - else: - alpha = state['guidance_scale'] - prompts = [prompt, state['negative_prompt'] or ''] - - ids, mask = self.tokenizer.encode( - prompts, - return_mask=True, - max_seq_len=self.model.config.max_seq_len, - add_bos=state['add_bos_token'] - ) - if state['auto_max_new_tokens']: - max_new_tokens = state['truncation_length'] - ids[0].shape[-1] - else: - max_new_tokens = state['max_new_tokens'] - - self.generator.gen_begin(ids, mask=mask) - initial_len = self.generator.sequence[0].shape[0] - has_leading_space = False - - for i in range(max_new_tokens): - logits = self.model.forward(self.generator.sequence[:, -1:], self.cache, input_mask=mask) - self.generator.apply_rep_penalty(logits) - - logits = F.log_softmax(logits, dim=-1) - logits_mixed = alpha * logits[0] + (1 - alpha) * logits[1] - - token, _ = self.generator.sample_current(logits_mixed) - if i == 0 and self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'): - has_leading_space = True - - decoded_text = self.generator.tokenizer.decode(self.generator.sequence[0][initial_len:]) - if has_leading_space: - decoded_text = ' ' + decoded_text - - # Check the partial unicode character - if chr(0xfffd) in decoded_text: - is_last = i == max_new_tokens - 1 - is_stopping = token.item() == self.tokenizer.eos_token_id or shared.stop_everything - # If we are not at the end of the generation, we skip this token - if not (is_last or is_stopping): - continue - - yield decoded_text - if token.item() == self.tokenizer.eos_token_id or shared.stop_everything: - break - - batch_token = token.repeat(2, 1) - self.generator.gen_accept_token(batch_token) - - def generate(self, prompt, state): - output = '' - for output in self.generate_with_streaming(prompt, state): - pass - - return output diff --git a/modules/exllama_hf.py b/modules/exllama_hf.py deleted file mode 100644 index 3ba1f3c386..0000000000 --- a/modules/exllama_hf.py +++ /dev/null @@ -1,174 +0,0 @@ -import os -from pathlib import Path -from typing import Any, Dict, Optional, Union - -import torch -from torch.nn import CrossEntropyLoss -from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel -from transformers.modeling_outputs import CausalLMOutputWithPast - -from modules import shared -from modules.logging_colors import logger - -try: - from exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig -except: - logger.warning('Exllama module failed to load. Will attempt to load from repositories.') - try: - from modules.relative_imports import RelativeImport - - with RelativeImport("repositories/exllama"): - from model import ExLlama, ExLlamaCache, ExLlamaConfig - except: - logger.error("Could not find repositories/exllama/. Make sure that exllama is cloned inside repositories/ and is up to date.") - raise - - -class ExllamaHF(PreTrainedModel): - def __init__(self, config: ExLlamaConfig): - super().__init__(PretrainedConfig()) - self.ex_config = config - self.ex_model = ExLlama(self.ex_config) - self.generation_config = GenerationConfig() - self.lora = None - - self.ex_cache = ExLlamaCache(self.ex_model) - self.past_seq = None - - if shared.args.cfg_cache: - self.ex_cache_negative = ExLlamaCache(self.ex_model) - self.past_seq_negative = None - - def _validate_model_class(self): - pass - - def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]): - pass - - def prepare_inputs_for_generation(self, input_ids, **kwargs): - return {'input_ids': input_ids, **kwargs} - - @property - def device(self) -> torch.device: - return torch.device(0) - - def __call__(self, *args, **kwargs): - use_cache = kwargs.get('use_cache', True) - labels = kwargs.get('labels', None) - past_key_values = kwargs.get('past_key_values', None) - - if len(args) > 0: - if not shared.args.cfg_cache: - logger.error("Please enable the cfg-cache option to use CFG with ExLlama_HF.") - return - - input_ids = args[0] - is_negative = True - past_seq = self.past_seq_negative - ex_cache = self.ex_cache_negative - else: - input_ids = kwargs['input_ids'] - is_negative = False - past_seq = self.past_seq - ex_cache = self.ex_cache - - seq = input_ids[0].tolist() - if is_negative and past_key_values is not None: - seq = past_key_values + seq - - seq_tensor = torch.tensor(seq) - reset = True - - # Make the forward call - if labels is None: - if past_seq is not None: - min_length = min(past_seq.shape[0], seq_tensor.shape[0]) - indices = torch.nonzero(~torch.eq(past_seq[:min_length], seq_tensor[:min_length])) - if len(indices) > 0: - longest_prefix = indices[0].item() - else: - longest_prefix = min_length - - if longest_prefix > 0: - reset = False - ex_cache.current_seq_len = longest_prefix - if len(seq_tensor) - longest_prefix > 1: - self.ex_model.forward(seq_tensor[longest_prefix:-1].view(1, -1), ex_cache, preprocess_only=True, lora=self.lora) - elif len(seq_tensor) == longest_prefix: - # Very tricky: if the prefix we are reusing *is* the input_ids, then we have to back up the cache pointer by one, - # because we feed input_ids[-1] to forward() below, but that last token is already in the cache! - ex_cache.current_seq_len -= 1 - - if reset: - ex_cache.current_seq_len = 0 - if len(seq_tensor) > 1: - self.ex_model.forward(seq_tensor[:-1].view(1, -1), ex_cache, preprocess_only=True, lora=self.lora) - - logits = self.ex_model.forward(seq_tensor[-1:].view(1, -1), ex_cache, lora=self.lora).to(input_ids.device) - else: - ex_cache.current_seq_len = 0 - logits = self.ex_model.forward(seq_tensor.view(1, -1), ex_cache, last_id_only=False, lora=self.lora) - - if is_negative: - self.past_seq_negative = seq_tensor - else: - self.past_seq = seq_tensor - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, logits.shape[-1]) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - return CausalLMOutputWithPast(logits=logits, past_key_values=seq if use_cache else None, loss=loss) - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs): - assert len(model_args) == 0 and len(kwargs) == 0, "extra args is currently not supported" - if isinstance(pretrained_model_name_or_path, str): - pretrained_model_name_or_path = Path(pretrained_model_name_or_path) - - pretrained_model_name_or_path = Path(f'{shared.args.model_dir}') / Path(pretrained_model_name_or_path) - config = ExLlamaConfig(pretrained_model_name_or_path / 'config.json') - - # from 'oobabooga/text-generation-webui/modules/exllama.py' - weight_path = None - for ext in ['.safetensors', '.pt', '.bin']: - found = list(pretrained_model_name_or_path.glob(f"*{ext}")) - if len(found) > 0: - weight_path = found[-1] - break - assert weight_path is not None, f'could not find weight in "{pretrained_model_name_or_path}"' - - config.model_path = str(weight_path) - config.max_seq_len = shared.args.max_seq_len - config.compress_pos_emb = shared.args.compress_pos_emb - if shared.args.gpu_split: - config.set_auto_map(shared.args.gpu_split) - config.gpu_peer_fix = True - - if shared.args.alpha_value > 1 and shared.args.rope_freq_base == 0: - config.alpha_value = shared.args.alpha_value - config.calculate_rotary_embedding_base() - elif shared.args.rope_freq_base > 0: - config.rotary_embedding_base = shared.args.rope_freq_base - - if torch.version.hip: - config.rmsnorm_no_half2 = True - config.rope_no_half2 = True - config.matmul_no_half2 = True - config.silu_no_half2 = True - - # This slowes down a bit but align better with autogptq generation. - # TODO: Should give user choice to tune the exllama config - # config.fused_attn = False - # config.fused_mlp_thd = 0 - - return ExllamaHF(config) diff --git a/modules/loaders.py b/modules/loaders.py index d8e268e9d8..238ae91b14 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -81,15 +81,15 @@ 'trust_remote_code', 'no_use_fast', ], - 'ExLlama_HF': [ + 'ExLlamav2': [ 'gpu_split', 'max_seq_len', + 'no_flash_attn', + 'num_experts_per_token', + 'cache_8bit', 'alpha_value', - 'rope_freq_base', 'compress_pos_emb', - 'cfg_cache', - 'trust_remote_code', - 'no_use_fast', + 'exllamav2_info', ], 'AutoGPTQ': [ 'triton', @@ -128,24 +128,6 @@ 'no_use_fast', 'gptq_for_llama_info', ], - 'ExLlamav2': [ - 'gpu_split', - 'max_seq_len', - 'no_flash_attn', - 'num_experts_per_token', - 'cache_8bit', - 'alpha_value', - 'compress_pos_emb', - 'exllamav2_info', - ], - 'ExLlama': [ - 'gpu_split', - 'max_seq_len', - 'alpha_value', - 'rope_freq_base', - 'compress_pos_emb', - 'exllama_info', - ], 'ctransformers': [ 'n_ctx', 'n_gpu_layers', @@ -216,54 +198,6 @@ def transformers_samplers(): 'AutoAWQ': transformers_samplers(), 'QuIP#': transformers_samplers(), 'HQQ': transformers_samplers(), - 'ExLlama_HF': { - 'temperature', - 'temperature_last', - 'top_p', - 'min_p', - 'top_k', - 'typical_p', - 'epsilon_cutoff', - 'eta_cutoff', - 'tfs', - 'top_a', - 'repetition_penalty', - 'presence_penalty', - 'frequency_penalty', - 'repetition_penalty_range', - 'encoder_repetition_penalty', - 'no_repeat_ngram_size', - 'min_length', - 'seed', - 'do_sample', - 'mirostat_mode', - 'mirostat_tau', - 'mirostat_eta', - 'grammar_file_row', - 'grammar_string', - 'guidance_scale', - 'negative_prompt', - 'ban_eos_token', - 'custom_token_bans', - 'add_bos_token', - 'skip_special_tokens', - 'auto_max_new_tokens', - }, - 'ExLlama': { - 'temperature', - 'top_p', - 'top_k', - 'typical_p', - 'repetition_penalty', - 'repetition_penalty_range', - 'seed', - 'guidance_scale', - 'negative_prompt', - 'ban_eos_token', - 'add_bos_token', - 'custom_token_bans', - 'auto_max_new_tokens', - }, 'ExLlamav2': { 'temperature', 'top_p', diff --git a/modules/logits.py b/modules/logits.py index e12cf6e785..19a9993f4d 100644 --- a/modules/logits.py +++ b/modules/logits.py @@ -14,11 +14,10 @@ def get_next_logits(prompt, state, use_samplers, previous, top_logits=50, return return 'Error: No model is loaded1 Select one in the Model tab.', previous is_non_hf_exllamav2 = shared.model.__class__.__name__ == 'Exllamav2Model' - is_non_hf_exllamav1 = shared.model.__class__.__name__ == 'ExllamaModel' is_non_hf_llamacpp = shared.model.__class__.__name__ == 'LlamaCppModel' if use_samplers: - if any([is_non_hf_exllamav2, is_non_hf_exllamav1, is_non_hf_llamacpp]): + if any([is_non_hf_exllamav2, is_non_hf_llamacpp]): logger.error("Sampler hijacking is not supported non-Huggingface loaders.") # sampling is all done in c for exllama, so it is really hard to hijack # it should be possible to hijack llamacpp sampler by hijacking all their sampling methods, @@ -32,7 +31,7 @@ def get_next_logits(prompt, state, use_samplers, previous, top_logits=50, return scores = sampler_hijack.global_scores[-1] else: - if is_non_hf_exllamav2 or is_non_hf_exllamav1: + if is_non_hf_exllamav2: if is_torch_xpu_available(): tokens = shared.tokenizer.encode(prompt).to("xpu:0") else: @@ -51,7 +50,7 @@ def get_next_logits(prompt, state, use_samplers, previous, top_logits=50, return probs = torch.softmax(scores, dim=-1, dtype=torch.float) topk_values, topk_indices = torch.topk(probs, k=top_logits, largest=True, sorted=True) - if is_non_hf_exllamav1 or is_non_hf_llamacpp: + if is_non_hf_llamacpp: topk_indices = [i.expand((1, 1)) for i in topk_indices] if hasattr(shared.tokenizer, 'convert_ids_to_tokens'): diff --git a/modules/models.py b/modules/models.py index 5235f10857..ed6f6b526a 100644 --- a/modules/models.py +++ b/modules/models.py @@ -66,8 +66,6 @@ def load_model(model_name, loader=None): 'llama.cpp': llamacpp_loader, 'llamacpp_HF': llamacpp_HF_loader, 'RWKV': RWKV_loader, - 'ExLlama': ExLlama_loader, - 'ExLlama_HF': ExLlama_HF_loader, 'ExLlamav2': ExLlamav2_loader, 'ExLlamav2_HF': ExLlamav2_HF_loader, 'ctransformers': ctransformers_loader, @@ -382,19 +380,6 @@ def AutoGPTQ_loader(model_name): return modules.AutoGPTQ_loader.load_quantized(model_name) -def ExLlama_loader(model_name): - from modules.exllama import ExllamaModel - - model, tokenizer = ExllamaModel.from_pretrained(model_name) - return model, tokenizer - - -def ExLlama_HF_loader(model_name): - from modules.exllama_hf import ExllamaHF - - return ExllamaHF.from_pretrained(model_name) - - def ExLlamav2_loader(model_name): from modules.exllamav2 import Exllamav2Model diff --git a/modules/models_settings.py b/modules/models_settings.py index 919a15bc27..d508227acc 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -41,13 +41,11 @@ def get_model_metadata(model): if 'loader' not in model_settings: if hf_metadata is not None and 'quip_params' in hf_metadata: - model_settings['loader'] = 'QuIP#' + loader = 'QuIP#' else: loader = infer_loader(model, model_settings) - if 'wbits' in model_settings and type(model_settings['wbits']) is int and model_settings['wbits'] > 0: - loader = 'AutoGPTQ' - model_settings['loader'] = loader + model_settings['loader'] = loader # GGUF metadata if model_settings['loader'] in ['llama.cpp', 'llamacpp_HF', 'ctransformers']: @@ -152,7 +150,7 @@ def infer_loader(model_name, model_settings): if not path_to_model.exists(): loader = None elif (path_to_model / 'quantize_config.json').exists() or ('wbits' in model_settings and type(model_settings['wbits']) is int and model_settings['wbits'] > 0): - loader = 'ExLlama_HF' + loader = 'ExLlamav2_HF' elif (path_to_model / 'quant_config.json').exists() or re.match(r'.*-awq', model_name.lower()): loader = 'AutoAWQ' elif len(list(path_to_model.glob('*.gguf'))) > 0: @@ -229,7 +227,7 @@ def apply_model_settings_to_state(model, state): loader = model_settings.pop('loader') # If the user is using an alternative loader for the same model type, let them keep using it - if not (loader == 'AutoGPTQ' and state['loader'] in ['GPTQ-for-LLaMa', 'ExLlama', 'ExLlama_HF', 'ExLlamav2', 'ExLlamav2_HF']) and not (loader == 'llama.cpp' and state['loader'] in ['llamacpp_HF', 'ctransformers']): + if not (loader == 'ExLlamav2_HF' and state['loader'] in ['GPTQ-for-LLaMa', 'ExLlamav2', 'AutoGPTQ']) and not (loader == 'llama.cpp' and state['loader'] in ['llamacpp_HF', 'ctransformers']): state['loader'] = loader for k in model_settings: diff --git a/modules/shared.py b/modules/shared.py index 36ace23c2d..60b3f8f4ad 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -85,7 +85,7 @@ # Model loader group = parser.add_argument_group('Model loader') -group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlama_HF, ExLlamav2_HF, AutoGPTQ, AutoAWQ, GPTQ-for-LLaMa, ExLlama, ExLlamav2, ctransformers, QuIP#.') +group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ, AutoAWQ, GPTQ-for-LLaMa, ctransformers, QuIP#.') # Transformers/Accelerate group = parser.add_argument_group('Transformers/Accelerate') @@ -131,7 +131,7 @@ group = parser.add_argument_group('ExLlama') group.add_argument('--gpu-split', type=str, help='Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7.') group.add_argument('--max_seq_len', type=int, default=2048, help='Maximum sequence length.') -group.add_argument('--cfg-cache', action='store_true', help='ExLlama_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader, but not necessary for CFG with base ExLlama.') +group.add_argument('--cfg-cache', action='store_true', help='ExLlamav2_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.') group.add_argument('--no_flash_attn', action='store_true', help='Force flash-attention to not be used.') group.add_argument('--cache_8bit', action='store_true', help='Use 8-bit cache to save VRAM.') group.add_argument('--num_experts_per_token', type=int, default=2, help='Number of experts to use for generation. Applies to MoE models like Mixtral.') @@ -260,8 +260,6 @@ def fix_loader_name(name): return 'GPTQ-for-LLaMa' elif name in ['exllama', 'ex-llama', 'ex_llama', 'exlama']: return 'ExLlama' - elif name in ['exllama-hf', 'exllama_hf', 'exllama hf', 'ex-llama-hf', 'ex_llama_hf']: - return 'ExLlama_HF' elif name in ['exllamav2', 'exllama-v2', 'ex_llama-v2', 'exlamav2', 'exlama-v2', 'exllama2', 'exllama-2']: return 'ExLlamav2' elif name in ['exllamav2-hf', 'exllamav2_hf', 'exllama-v2-hf', 'exllama_v2_hf', 'exllama-v2_hf', 'exllama2-hf', 'exllama2_hf', 'exllama-2-hf', 'exllama_2_hf', 'exllama-2_hf']: diff --git a/modules/text_generation.py b/modules/text_generation.py index 49ae6fdea2..b39a037f24 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -44,7 +44,7 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap yield '' return - if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'ExllamaModel', 'Exllamav2Model', 'CtransformersModel']: + if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'Exllamav2Model', 'CtransformersModel']: generate_func = generate_reply_custom else: generate_func = generate_reply_HF @@ -132,7 +132,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt if truncation_length is not None: input_ids = input_ids[:, -truncation_length:] - if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'ExllamaModel', 'Exllamav2Model', 'CtransformersModel'] or shared.args.cpu: + if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'Exllamav2Model', 'CtransformersModel'] or shared.args.cpu: return input_ids elif shared.args.deepspeed: return input_ids.to(device=local_rank) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index ae8194b1be..7b28a34cea 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -96,7 +96,7 @@ def create_ui(): shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=shared.args.groupsize if shared.args.groupsize > 0 else "None") shared.gradio['model_type'] = gr.Dropdown(label="model_type", choices=["None"], value=shared.args.model_type or "None") shared.gradio['pre_layer'] = gr.Slider(label="pre_layer", minimum=0, maximum=100, value=shared.args.pre_layer[0] if shared.args.pre_layer is not None else 0) - shared.gradio['autogptq_info'] = gr.Markdown('* ExLlama_HF is recommended over AutoGPTQ for models derived from Llama.') + shared.gradio['autogptq_info'] = gr.Markdown('* ExLlamav2_HF is recommended over AutoGPTQ for models derived from Llama.') shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') shared.gradio['max_seq_len'] = gr.Slider(label='max_seq_len', minimum=0, maximum=shared.settings['truncation_length_max'], step=256, info='Context length. Try lowering this if you run out of memory while loading the model.', value=shared.args.max_seq_len) shared.gradio['alpha_value'] = gr.Slider(label='alpha_value', minimum=1, maximum=8, step=0.05, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.', value=shared.args.alpha_value) @@ -134,8 +134,7 @@ def create_ui(): shared.gradio['cache_8bit'] = gr.Checkbox(label="cache_8bit", value=shared.args.cache_8bit, info='Use 8-bit cache to save VRAM.') shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.') shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.') - shared.gradio['gptq_for_llama_info'] = gr.Markdown('Legacy loader for compatibility with older GPUs. ExLlama_HF or AutoGPTQ are preferred for GPTQ models when supported.') - shared.gradio['exllama_info'] = gr.Markdown("ExLlama_HF is recommended over ExLlama for better integration with extensions and more consistent sampling behavior across loaders.") + shared.gradio['gptq_for_llama_info'] = gr.Markdown('Legacy loader for compatibility with older GPUs. ExLlamav2_HF or AutoGPTQ are preferred for GPTQ models when supported.') shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.") shared.gradio['llamacpp_HF_info'] = gr.Markdown('llamacpp_HF loads llama.cpp as a Transformers model. To use it, you need to download a tokenizer.\n\nOption 1 (recommended): place your .gguf in a subfolder of models/ along with these 4 files: special_tokens_map.json, tokenizer_config.json, tokenizer.json, tokenizer.model.\n\nOption 2: download `oobabooga/llama-tokenizer` under "Download model or LoRA". That\'s a default Llama tokenizer that will work for some (but not all) models.') diff --git a/one_click.py b/one_click.py index 76e8580ecc..e0ef795aa2 100644 --- a/one_click.py +++ b/one_click.py @@ -343,27 +343,6 @@ def update_requirements(initial_installation=False): if not os.path.exists("repositories/"): os.mkdir("repositories") - os.chdir("repositories") - - # Install or update ExLlama as needed - if not os.path.exists("exllama/"): - run_cmd("git clone https://github.com/turboderp/exllama.git", environment=True) - else: - os.chdir("exllama") - run_cmd("git pull", environment=True) - os.chdir("..") - - if is_linux(): - # Fix JIT compile issue with ExLlama in Linux/WSL - if not os.path.exists(f"{conda_env_path}/lib64"): - run_cmd(f'ln -s "{conda_env_path}/lib" "{conda_env_path}/lib64"', environment=True) - - # On some Linux distributions, g++ may not exist or be the wrong version to compile GPTQ-for-LLaMa - gxx_output = run_cmd("g++ -dumpfullversion -dumpversion", environment=True, capture_output=True) - if gxx_output.returncode != 0 or int(gxx_output.stdout.strip().split(b".")[0]) > 11: - # Install the correct version of g++ - run_cmd("conda install -y -k conda-forge::gxx_linux-64=11.2.0", environment=True) - clear_cache() diff --git a/requirements.txt b/requirements.txt index e0a86a3ea3..f5cde1c1ce 100644 --- a/requirements.txt +++ b/requirements.txt @@ -66,14 +66,6 @@ https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu1 https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" -https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9" -https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8" -https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" -https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" https://github.com/turboderp/exllamav2/releases/download/v0.0.11/exllamav2-0.0.11+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp/exllamav2/releases/download/v0.0.11/exllamav2-0.0.11+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" https://github.com/turboderp/exllamav2/releases/download/v0.0.11/exllamav2-0.0.11+cu121-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9" diff --git a/requirements_amd.txt b/requirements_amd.txt index 537225c0b3..a3d9f90ef3 100644 --- a/requirements_amd.txt +++ b/requirements_amd.txt @@ -46,10 +46,6 @@ https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+roc https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+rocm5.6-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+rocm5.6-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" -https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+rocm5.6-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" -https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+rocm5.6-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" https://github.com/turboderp/exllamav2/releases/download/v0.0.11/exllamav2-0.0.11+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp/exllamav2/releases/download/v0.0.11/exllamav2-0.0.11+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt index e2bdb294cd..10e410322f 100644 --- a/requirements_amd_noavx2.txt +++ b/requirements_amd_noavx2.txt @@ -42,10 +42,6 @@ https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+roc https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+rocm5.6-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+rocm5.6-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" -https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+rocm5.6-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" -https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+rocm5.6-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" https://github.com/turboderp/exllamav2/releases/download/v0.0.11/exllamav2-0.0.11+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp/exllamav2/releases/download/v0.0.11/exllamav2-0.0.11+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt index f8e8fdc87b..a9c9b35fa2 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -66,14 +66,6 @@ https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu1 https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" -https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9" -https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121-cp38-cp38-win_amd64.whl; platform_system == "Windows" and python_version == "3.8" -https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121-cp39-cp39-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.9" -https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121-cp38-cp38-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.8" https://github.com/turboderp/exllamav2/releases/download/v0.0.11/exllamav2-0.0.11+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp/exllamav2/releases/download/v0.0.11/exllamav2-0.0.11+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" https://github.com/turboderp/exllamav2/releases/download/v0.0.11/exllamav2-0.0.11+cu121-cp39-cp39-win_amd64.whl; platform_system == "Windows" and python_version == "3.9" From 2734ce3e4c1241ff5d51a173bcbe71c5839a3f91 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 31 Dec 2023 02:01:40 -0300 Subject: [PATCH 07/10] Remove RWKV loader (#5130) --- README.md | 7 -- modules/RWKV.py | 154 ------------------------------------- modules/models.py | 18 ----- modules/models_settings.py | 2 - modules/shared.py | 5 -- modules/text_generation.py | 6 +- 6 files changed, 3 insertions(+), 189 deletions(-) delete mode 100644 modules/RWKV.py diff --git a/README.md b/README.md index 5df4d6bc53..93bbf0e394 100644 --- a/README.md +++ b/README.md @@ -312,13 +312,6 @@ List of command-line flags | `--nvme-offload-dir NVME_OFFLOAD_DIR` | DeepSpeed: Directory to use for ZeRO-3 NVME offloading. | | `--local_rank LOCAL_RANK` | DeepSpeed: Optional argument for distributed setups. | -#### RWKV - -| Flag | Description | -|---------------------------------|-------------| -| `--rwkv-strategy RWKV_STRATEGY` | RWKV: The strategy to use while loading the model. Examples: "cpu fp32", "cuda fp16", "cuda fp16i8". | -| `--rwkv-cuda-on` | RWKV: Compile the CUDA kernel for better performance. | - #### RoPE (for llama.cpp, ExLlamaV2, and transformers) | Flag | Description | diff --git a/modules/RWKV.py b/modules/RWKV.py deleted file mode 100644 index 8a15e5406f..0000000000 --- a/modules/RWKV.py +++ /dev/null @@ -1,154 +0,0 @@ -''' -This loader is not currently maintained as RWKV can now be loaded -through the transformers library. -''' - -import copy -import os -from pathlib import Path - -import numpy as np -from tokenizers import Tokenizer -from transformers import is_torch_xpu_available - -import modules.shared as shared -from modules.callbacks import Iteratorize - -np.set_printoptions(precision=4, suppress=True, linewidth=200) - -os.environ['RWKV_JIT_ON'] = '1' -os.environ["RWKV_CUDA_ON"] = '1' if shared.args.rwkv_cuda_on else '0' # use CUDA kernel for seq mode (much faster) - -from rwkv.model import RWKV -from rwkv.utils import PIPELINE, PIPELINE_ARGS - - -class RWKVModel: - def __init__(self): - pass - - @classmethod - def from_pretrained(self, path, dtype="bf16" if is_torch_xpu_available() else "fp16", device="xpu" if is_torch_xpu_available() else "cuda"): - tokenizer_path = Path(f"{path.parent}/20B_tokenizer.json") - if shared.args.rwkv_strategy is None: - model = RWKV(model=str(path), strategy=f'{device} {dtype}') - else: - model = RWKV(model=str(path), strategy=shared.args.rwkv_strategy) - - pipeline = PIPELINE(model, str(tokenizer_path)) - result = self() - result.pipeline = pipeline - result.model = model - result.cached_context = "" - result.cached_model_state = None - result.cached_output_logits = None - return result - - def generate(self, prompt, state, callback=None): - args = PIPELINE_ARGS( - temperature=state['temperature'], - top_p=state['top_p'], - top_k=state['top_k'], - alpha_frequency=0.1, # Frequency Penalty (as in GPT-3) - alpha_presence=0.1, # Presence Penalty (as in GPT-3) - token_ban=[0], # ban the generation of some tokens - token_stop=[] - ) - - if self.cached_context != "": - if prompt.startswith(self.cached_context): - prompt = prompt[len(self.cached_context):] - else: - self.cached_context = "" - self.cached_model_state = None - self.cached_output_logits = None - - # out = self.pipeline.generate(prompt, token_count=state['max_new_tokens'], args=args, callback=callback) - out = self.generate_from_cached_state(prompt, token_count=state['max_new_tokens'], args=args, callback=callback) - return out - - def generate_with_streaming(self, *args, **kwargs): - with Iteratorize(self.generate, args, kwargs, callback=None) as generator: - reply = '' - for token in generator: - reply += token - yield reply - - # Similar to the PIPELINE.generate, but lets us maintain the cached_model_state - def generate_from_cached_state(self, ctx="", token_count=20, args=None, callback=None): - all_tokens = [] - out_str = '' - occurrence = {} - state = copy.deepcopy(self.cached_model_state) if self.cached_model_state is not None else None - - # if we ended up with an empty context, just reuse the cached logits - # this can happen if a user undoes a message and then sends the exact message again - # in that case the full context ends up being the same as the cached_context, so the remaining context is empty. - if ctx == "": - out = self.cached_output_logits - - token = None - for i in range(token_count): - # forward - tokens = self.pipeline.encode(ctx) if i == 0 else [token] - while len(tokens) > 0: - out, state = self.model.forward(tokens[:args.chunk_len], state) - tokens = tokens[args.chunk_len:] - if i == 0: - begin_token = len(all_tokens) - last_token_posi = begin_token - # cache the model state after scanning the context - # we don't cache the state after processing our own generated tokens because - # the output string might be post-processed arbitrarily. Therefore, what's fed into the model - # on the next round of chat might be slightly different what what it output on the previous round - if i == 0: - self.cached_context += ctx - self.cached_model_state = copy.deepcopy(state) - self.cached_output_logits = copy.deepcopy(out) - - # adjust probabilities - for n in args.token_ban: - out[n] = -float('inf') - - for n in occurrence: - out[n] -= (args.alpha_presence + occurrence[n] * args.alpha_frequency) - - # sampler - token = self.pipeline.sample_logits(out, temperature=args.temperature, top_p=args.top_p, top_k=args.top_k) - if token in args.token_stop: - break - - all_tokens += [token] - if token not in occurrence: - occurrence[token] = 1 - else: - occurrence[token] += 1 - - # output - tmp = self.pipeline.decode(all_tokens[last_token_posi:]) - if '\ufffd' not in tmp: # is valid utf-8 string? - if callback: - callback(tmp) - - out_str += tmp - last_token_posi = begin_token + i + 1 - return out_str - - -class RWKVTokenizer: - def __init__(self): - pass - - @classmethod - def from_pretrained(self, path): - tokenizer_path = path / "20B_tokenizer.json" - tokenizer = Tokenizer.from_file(str(tokenizer_path)) - result = self() - result.tokenizer = tokenizer - return result - - def encode(self, prompt): - return self.tokenizer.encode(prompt).ids - - def decode(self, ids): - return self.tokenizer.decode(ids) diff --git a/modules/models.py b/modules/models.py index ed6f6b526a..7f338712b5 100644 --- a/modules/models.py +++ b/modules/models.py @@ -65,7 +65,6 @@ def load_model(model_name, loader=None): 'GPTQ-for-LLaMa': GPTQ_loader, 'llama.cpp': llamacpp_loader, 'llamacpp_HF': llamacpp_HF_loader, - 'RWKV': RWKV_loader, 'ExLlamav2': ExLlamav2_loader, 'ExLlamav2_HF': ExLlamav2_HF_loader, 'ctransformers': ctransformers_loader, @@ -405,23 +404,6 @@ def HQQ_loader(model_name): return model -def RWKV_loader(model_name): - ''' - This loader is not currently maintained as RWKV can now be loaded - through the transformers library. - ''' - from modules.RWKV import RWKVModel, RWKVTokenizer - - model = RWKVModel.from_pretrained( - Path(f'{shared.args.model_dir}/{model_name}'), - dtype="fp32" if shared.args.cpu else "bf16" if shared.args.bf16 else "fp16", - device="cpu" if shared.args.cpu else "xpu" if is_xpu_available() else "cuda" - ) - - tokenizer = RWKVTokenizer.from_pretrained(Path(shared.args.model_dir)) - return model, tokenizer - - def get_max_memory_dict(): max_memory = {} max_cpu_memory = shared.args.cpu_memory.strip() if shared.args.cpu_memory is not None else '99GiB' diff --git a/modules/models_settings.py b/modules/models_settings.py index d508227acc..9acc7efa3b 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -157,8 +157,6 @@ def infer_loader(model_name, model_settings): loader = 'llama.cpp' elif re.match(r'.*\.gguf', model_name.lower()): loader = 'llama.cpp' - elif re.match(r'.*rwkv.*\.pth', model_name.lower()): - loader = 'RWKV' elif re.match(r'.*exl2', model_name.lower()): loader = 'ExLlamav2_HF' elif re.match(r'.*-hqq', model_name.lower()): diff --git a/modules/shared.py b/modules/shared.py index 60b3f8f4ad..785d5509c0 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -165,11 +165,6 @@ group.add_argument('--nvme-offload-dir', type=str, help='DeepSpeed: Directory to use for ZeRO-3 NVME offloading.') group.add_argument('--local_rank', type=int, default=0, help='DeepSpeed: Optional argument for distributed setups.') -# RWKV -group = parser.add_argument_group('RWKV') -group.add_argument('--rwkv-strategy', type=str, default=None, help='RWKV: The strategy to use while loading the model. Examples: "cpu fp32", "cuda fp16", "cuda fp16i8".') -group.add_argument('--rwkv-cuda-on', action='store_true', help='RWKV: Compile the CUDA kernel for better performance.') - # RoPE group = parser.add_argument_group('RoPE') group.add_argument('--alpha_value', type=float, default=1, help='Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.') diff --git a/modules/text_generation.py b/modules/text_generation.py index b39a037f24..d43801882d 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -44,7 +44,7 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap yield '' return - if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'Exllamav2Model', 'CtransformersModel']: + if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'CtransformersModel']: generate_func = generate_reply_custom else: generate_func = generate_reply_HF @@ -118,7 +118,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt if shared.tokenizer is None: raise ValueError('No tokenizer is loaded') - if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'CtransformersModel', 'Exllamav2Model']: + if shared.model.__class__.__name__ in ['LlamaCppModel', 'CtransformersModel', 'Exllamav2Model']: input_ids = shared.tokenizer.encode(str(prompt)) if shared.model.__class__.__name__ not in ['Exllamav2Model']: input_ids = np.array(input_ids).reshape(1, len(input_ids)) @@ -132,7 +132,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt if truncation_length is not None: input_ids = input_ids[:, -truncation_length:] - if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'Exllamav2Model', 'CtransformersModel'] or shared.args.cpu: + if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'CtransformersModel'] or shared.args.cpu: return input_ids elif shared.args.deepspeed: return input_ids.to(device=local_rank) From 485b85ee76329c1d45473102486abd3e5d1388f5 Mon Sep 17 00:00:00 2001 From: TheInvisibleMage <64934262+TheInvisibleMage@users.noreply.github.com> Date: Sun, 31 Dec 2023 16:03:23 +1100 Subject: [PATCH 08/10] Superboogav2 Quick Fixes (#5089) --- extensions/superboogav2/chat_handler.py | 28 ++++++--------------- extensions/superboogav2/notebook_handler.py | 4 +-- extensions/superboogav2/script.py | 4 +-- 3 files changed, 11 insertions(+), 25 deletions(-) diff --git a/extensions/superboogav2/chat_handler.py b/extensions/superboogav2/chat_handler.py index 215f7bdbc7..419b926451 100644 --- a/extensions/superboogav2/chat_handler.py +++ b/extensions/superboogav2/chat_handler.py @@ -1,14 +1,14 @@ """ This module is responsible for modifying the chat prompt and history. """ -import json import re import extensions.superboogav2.parameters as parameters -from modules import chat +from modules import chat, shared from modules.text_generation import get_encoded_length from modules.logging_colors import logger +from modules.chat import load_character_memoized from extensions.superboogav2.utils import create_context_text, create_metadata_source from .data_processor import process_and_add_to_collector @@ -17,14 +17,6 @@ CHAT_METADATA = create_metadata_source('automatic-chat-insert') -INSTRUCT_MODE = 'instruct' -CHAT_INSTRUCT_MODE = 'chat-instruct' - - -def _is_instruct_mode(state: dict): - mode = state.get('mode') - return mode == INSTRUCT_MODE or mode == CHAT_INSTRUCT_MODE - def _remove_tag_if_necessary(user_input: str): if not parameters.get_is_manual(): @@ -51,17 +43,11 @@ def _format_single_exchange(name, text): def _get_names(state: dict): - if _is_instruct_mode(state): - user_name = state['name1_instruct'] - bot_name = state['name2_instruct'] - else: - user_name = state['name1'] - bot_name = state['name2'] - - if not user_name: - user_name = 'User' - if not bot_name: - bot_name = 'Assistant' + default_char = shared.settings.get('character', "Assistant") + default_user = shared.settings.get('name1', "You") + character = state.get('character', default_char) + user_name = state.get('name1', default_user) + user_name, bot_name, _, _, _ = load_character_memoized(character, user_name, '') return user_name, bot_name diff --git a/extensions/superboogav2/notebook_handler.py b/extensions/superboogav2/notebook_handler.py index 9faadfed12..7b86434969 100644 --- a/extensions/superboogav2/notebook_handler.py +++ b/extensions/superboogav2/notebook_handler.py @@ -16,9 +16,9 @@ def _remove_special_tokens(string): return re.sub(pattern, '', string) -def input_modifier_internal(string, collector): +def input_modifier_internal(string, collector, is_chat): # Sanity check. - if shared.is_chat(): + if is_chat: return string # Find the user input diff --git a/extensions/superboogav2/script.py b/extensions/superboogav2/script.py index 0870ab4c3b..66f56e29ea 100644 --- a/extensions/superboogav2/script.py +++ b/extensions/superboogav2/script.py @@ -167,8 +167,8 @@ def custom_generate_chat_prompt(user_input, state, **kwargs): return custom_generate_chat_prompt_internal(user_input, state, collector, **kwargs) -def input_modifier(string): - return input_modifier_internal(string, collector) +def input_modifier(string, state, is_chat=False): + return input_modifier_internal(string, collector, is_chat) def ui(): From 2aad91f3c9bc48eaee38ba313d2b928e6faa56e0 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 31 Dec 2023 02:07:48 -0300 Subject: [PATCH 09/10] Remove deprecated command-line flags (#5131) --- modules/shared.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/modules/shared.py b/modules/shared.py index 785d5509c0..b41941077e 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -198,15 +198,7 @@ group.add_argument('--multimodal-pipeline', type=str, default=None, help='The multimodal pipeline to use. Examples: llava-7b, llava-13b.') # Deprecated parameters -group = parser.add_argument_group('Deprecated') -group.add_argument('--notebook', action='store_true', help='DEPRECATED') -group.add_argument('--chat', action='store_true', help='DEPRECATED') -group.add_argument('--no-stream', action='store_true', help='DEPRECATED') -group.add_argument('--mul_mat_q', action='store_true', help='DEPRECATED') -group.add_argument('--api-blocking-port', type=int, default=5000, help='DEPRECATED') -group.add_argument('--api-streaming-port', type=int, default=5005, help='DEPRECATED') -group.add_argument('--llama_cpp_seed', type=int, default=0, help='DEPRECATED') -group.add_argument('--use_fast', action='store_true', help='DEPRECATED') +# group = parser.add_argument_group('Deprecated') args = parser.parse_args() args_defaults = parser.parse_args([]) @@ -216,7 +208,7 @@ if hasattr(args, arg): provided_arguments.append(arg) -deprecated_args = ['notebook', 'chat', 'no_stream', 'mul_mat_q', 'use_fast'] +deprecated_args = [] def do_cmd_flags_warnings(): From cbf6f9e6956f754e88b4c9f706f21365ffc1c1cb Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 30 Dec 2023 21:31:17 -0800 Subject: [PATCH 10/10] Update some UI messages --- modules/ui_model_menu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 7b28a34cea..3c8bca8ff6 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -96,13 +96,13 @@ def create_ui(): shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=shared.args.groupsize if shared.args.groupsize > 0 else "None") shared.gradio['model_type'] = gr.Dropdown(label="model_type", choices=["None"], value=shared.args.model_type or "None") shared.gradio['pre_layer'] = gr.Slider(label="pre_layer", minimum=0, maximum=100, value=shared.args.pre_layer[0] if shared.args.pre_layer is not None else 0) - shared.gradio['autogptq_info'] = gr.Markdown('* ExLlamav2_HF is recommended over AutoGPTQ for models derived from Llama.') + shared.gradio['autogptq_info'] = gr.Markdown('ExLlamav2_HF is recommended over AutoGPTQ for models derived from Llama.') shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') shared.gradio['max_seq_len'] = gr.Slider(label='max_seq_len', minimum=0, maximum=shared.settings['truncation_length_max'], step=256, info='Context length. Try lowering this if you run out of memory while loading the model.', value=shared.args.max_seq_len) shared.gradio['alpha_value'] = gr.Slider(label='alpha_value', minimum=1, maximum=8, step=0.05, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.', value=shared.args.alpha_value) shared.gradio['rope_freq_base'] = gr.Slider(label='rope_freq_base', minimum=0, maximum=1000000, step=1000, info='If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63)', value=shared.args.rope_freq_base) shared.gradio['compress_pos_emb'] = gr.Slider(label='compress_pos_emb', minimum=1, maximum=8, step=1, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.', value=shared.args.compress_pos_emb) - shared.gradio['quipsharp_info'] = gr.Markdown('QuIP# only works on Linux.') + shared.gradio['quipsharp_info'] = gr.Markdown('QuIP# has to be installed manually at the moment.') with gr.Column(): shared.gradio['tensorcores'] = gr.Checkbox(label="tensorcores", value=shared.args.tensorcores, info='Use llama-cpp-python compiled with tensor cores support. This increases performance on RTX cards. NVIDIA only.')