Merge dev branch #5132

Merged
merged 10 commits on Dec 31, 2023
1 change: 1 addition & 0 deletions .gitignore
@@ -30,6 +30,7 @@
venv
.envrc
.direnv
.vs
.vscode
*.bak
*.ipynb
26 changes: 5 additions & 21 deletions README.md
@@ -11,7 +11,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
## Features

* 3 interface modes: default (two columns), notebook, and chat.
* Multiple model backends: [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp) (through [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)), [ExLlama](https://github.com/turboderp/exllama), [ExLlamaV2](https://github.com/turboderp/exllamav2), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa), [CTransformers](https://github.com/marella/ctransformers), [QuIP#](https://github.com/Cornell-RelaxML/quip-sharp).
* Multiple model backends: [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp) (through [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)), [ExLlamaV2](https://github.com/turboderp/exllamav2), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa), [CTransformers](https://github.com/marella/ctransformers), [QuIP#](https://github.com/Cornell-RelaxML/quip-sharp).
* Dropdown menu for quickly switching between different models.
* Large number of extensions (built-in and user-contributed), including Coqui TTS for realistic voice outputs, Whisper STT for voice inputs, translation, [multimodal pipelines](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/multimodal), vector databases, Stable Diffusion integration, and a lot more. See [the wiki](https://github.com/oobabooga/text-generation-webui/wiki/07-%E2%80%90-Extensions) and [the extensions directory](https://github.com/oobabooga/text-generation-webui-extensions) for details.
* [Chat with custom characters](https://github.com/oobabooga/text-generation-webui/wiki/03-%E2%80%90-Parameters-Tab#character).
@@ -140,13 +140,6 @@ Then browse to
3) Manually install AutoGPTQ: [Installation](https://github.com/PanQiWei/AutoGPTQ#install-from-source).
* Perform the from-source installation - there are no prebuilt ROCm packages for Windows.

4) Manually install [ExLlama](https://github.com/turboderp/exllama) by simply cloning it into the `repositories` folder (it will be automatically compiled at runtime after that):

```sh
cd text-generation-webui
git clone https://github.com/turboderp/exllama repositories/exllama
```

##### Older NVIDIA GPUs

1) For Kepler GPUs and older, you will need to install CUDA 11.8 instead of 12:
@@ -216,7 +209,7 @@ List of command-line flags

| Flag | Description |
|--------------------------------------------|-------------|
| `--loader LOADER` | Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlama_HF, ExLlamav2_HF, AutoGPTQ, AutoAWQ, GPTQ-for-LLaMa, ExLlama, ExLlamav2, ctransformers, QuIP#. |
| `--loader LOADER` | Choose the model loader manually; otherwise, it will be autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ, AutoAWQ, GPTQ-for-LLaMa, ctransformers, QuIP#. |
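
As a quick illustration, a loader can be forced from the command line as sketched below; `server.py` is the web UI's launch script, and the model name is just a placeholder:

```sh
# Illustrative example: pick a loader explicitly instead of relying on autodetection.
# "MyOrg_MyModel-GPTQ" is a placeholder folder name under models/.
python server.py --model MyOrg_MyModel-GPTQ --loader ExLlamav2_HF
```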

#### Accelerate/transformers

@@ -231,8 +224,6 @@ List of command-line flags
| `--load-in-8bit` | Load the model with 8-bit precision (using bitsandbytes). |
| `--bf16` | Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. |
| `--no-cache` | Set `use_cache` to `False` while generating text. This reduces VRAM usage slightly, but it comes at a performance cost. |
| `--xformers` | Use xformer's memory efficient attention. This is really old and probably doesn't do anything. |
| `--sdp-attention` | Use PyTorch 2.0's SDP attention. Same as above. |
| `--trust-remote-code` | Set `trust_remote_code=True` while loading the model. Necessary for some models. |
| `--no_use_fast` | Set use_fast=False while loading the tokenizer (it's True by default). Use this if you have any problems related to use_fast. |
| `--use_flash_attention_2` | Set use_flash_attention_2=True while loading the model. |
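
As a rough sketch, several of the flags above can be combined in a single launch command; the model name below is a placeholder, and the right combination depends on your model and hardware:

```sh
# Illustrative example combining Transformers-loader flags (placeholder model name)
python server.py --model MyOrg_MyModel --bf16 --use_flash_attention_2 --trust-remote-code
```
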
@@ -267,13 +258,13 @@ List of command-line flags
| `--no_offload_kqv` | Do not offload the K, Q, V to the GPU. This saves VRAM but reduces performance. |
| `--cache-capacity CACHE_CAPACITY` | Maximum cache capacity (llama-cpp-python). Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed. |
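
A minimal llama.cpp launch might look like the sketch below; the GGUF file name is a placeholder:

```sh
# Illustrative example for a GGUF model with the llama.cpp loader (placeholder file name)
python server.py --model mymodel.Q4_K_M.gguf --loader llama.cpp --cache-capacity 2GiB
```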

#### ExLlama
#### ExLlamav2

| Flag | Description |
|------------------|-------------|
|`--gpu-split` | Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7. |
|`--max_seq_len MAX_SEQ_LEN` | Maximum sequence length. |
|`--cfg-cache` | ExLlama_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader, but not necessary for CFG with base ExLlama. |
|`--cfg-cache` | ExLlamav2_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader. |
|`--no_flash_attn` | Force flash-attention to not be used. |
|`--cache_8bit` | Use 8-bit cache to save VRAM. |
|`--num_experts_per_token NUM_EXPERTS_PER_TOKEN` | Number of experts to use for generation. Applies to MoE models like Mixtral. |
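
For example, a multi-GPU ExLlamav2 launch could combine several of these flags as sketched below; the model name is a placeholder and the split values depend on your GPUs:

```sh
# Illustrative multi-GPU example for the ExLlamav2 loaders (placeholder model name)
python server.py --model MyOrg_MyModel-exl2 --loader ExLlamav2_HF \
  --gpu-split 20,7 --max_seq_len 4096 --cache_8bit
```
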
@@ -321,14 +312,7 @@ List of command-line flags
| `--nvme-offload-dir NVME_OFFLOAD_DIR` | DeepSpeed: Directory to use for ZeRO-3 NVME offloading. |
| `--local_rank LOCAL_RANK` | DeepSpeed: Optional argument for distributed setups. |

#### RWKV

| Flag | Description |
|---------------------------------|-------------|
| `--rwkv-strategy RWKV_STRATEGY` | RWKV: The strategy to use while loading the model. Examples: "cpu fp32", "cuda fp16", "cuda fp16i8". |
| `--rwkv-cuda-on` | RWKV: Compile the CUDA kernel for better performance. |

#### RoPE (for llama.cpp, ExLlama, ExLlamaV2, and transformers)
#### RoPE (for llama.cpp, ExLlamaV2, and transformers)

| Flag | Description |
|------------------|-------------|
13 changes: 6 additions & 7 deletions css/html_instruct_style.css
@@ -1,13 +1,13 @@
.chat {
background: var(--block-background-fill);
padding: 24px 19px;
padding-right: 19px !important;
padding-top: 0px;
border: 1px solid var(--block-border-color);
background: var(--block-background-fill);
padding: 24px 19px;
padding-right: 19px !important;
padding-top: 0;
border: 1px solid var(--block-border-color);
}

.chat > .messages {
padding-top: 28px !important;
padding-top: 28px !important;
}

.message {
@@ -62,7 +62,6 @@
.gradio-container .chat .user-message {
padding: 20px;
background-color: var(--color-accent-soft);
border-radius: 20px;
margin-bottom: 12px !important;
margin-left: 16px;
border-radius: 22px;
4 changes: 2 additions & 2 deletions css/main.css
@@ -92,7 +92,7 @@ div.svelte-15lo0d8 > *, div.svelte-15lo0d8 > .form > * {
.header_bar {
background-color: #f7f7f7;
box-shadow: 0 2px 3px rgba(22 22 22 / 35%);
margin-bottom: 0px;
margin-bottom: 0;
overflow-x: scroll;
margin-left: calc(-1 * var(--size-4));
margin-right: calc(-1 * var(--size-4));
@@ -303,7 +303,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
}

#chat-tab {
padding-top: 0px;
padding-top: 0;
}

#chat-tab button#Generate, #chat-tab button#stop {
21 changes: 5 additions & 16 deletions docs/04 - Model Tab.md
@@ -32,32 +32,21 @@ Options:
* **use_flash_attention_2**: Set use_flash_attention_2=True while loading the model. Possibly useful for training.
* **disable_exllama**: Only applies when you are loading a GPTQ model through the transformers loader. It needs to be checked if you intend to train LoRAs with the model.

### ExLlama_HF
### ExLlamav2_HF

Loads: GPTQ models. They usually have GPTQ in the model name, or alternatively something like "-4bit-128g" in the name.
Loads: GPTQ and EXL2 models. EXL2 models usually have "EXL2" in the model name, while GPTQ models usually have "GPTQ" or something like "-4bit-128g" in the name.

Example: https://huggingface.co/TheBloke/Llama-2-13B-chat-GPTQ
Examples:

ExLlama_HF is the v1 of ExLlama (https://github.com/turboderp/exllama) connected to the transformers library for sampling, tokenizing, and detokenizing. It is very fast and memory-efficient.
* https://huggingface.co/turboderp/Llama2-70B-exl2
* https://huggingface.co/TheBloke/Llama-2-13B-chat-GPTQ

* **gpu-split**: If you have multiple GPUs, the amount of memory to allocate per GPU should be set in this field. Make sure to set a lower value for the first GPU, as that's where the cache is allocated.
* **max_seq_len**: The maximum sequence length for the model. In ExLlama, the cache is preallocated, so the higher this value, the higher the VRAM. It is automatically set to the maximum sequence length for the model based on its metadata, but you may need to lower this value to be able to fit the model into your GPU. After loading the model, the "Truncate the prompt up to this length" parameter under "Parameters" > "Generation" is automatically set to your chosen "max_seq_len" so that you don't have to set the same thing twice.
* **cfg-cache**: Creates a second cache to hold the CFG negative prompts. You need to set this if and only if you intend to use CFG in the "Parameters" > "Generation" tab. Checking this parameter doubles the cache VRAM usage.
* **no_flash_attn**: Disables flash attention. Otherwise, it is automatically used as long as the library is installed.
* **cache_8bit**: Create an 8-bit precision cache instead of a 16-bit one. This saves VRAM but increases perplexity (I don't know by how much).

### ExLlamav2_HF

Loads: GPTQ and EXL2 models. EXL2 models usually have "EXL2" in the model name.

Example: https://huggingface.co/turboderp/Llama2-70B-exl2

The parameters are the same as in ExLlama_HF.

### ExLlama

The same as ExLlama_HF but using the internal samplers of ExLlama instead of the ones in the Transformers library.

### ExLlamav2

The same as ExLlamav2_HF but using the internal samplers of ExLlamav2 instead of the ones in the Transformers library.
2 changes: 0 additions & 2 deletions docs/What Works.md
@@ -3,9 +3,7 @@
| Loader | Loading 1 LoRA | Loading 2 or more LoRAs | Training LoRAs | Multimodal extension | Perplexity evaluation |
|----------------|----------------|-------------------------|----------------|----------------------|-----------------------|
| Transformers | ✅ | ✅*** | ✅* | ✅ | ✅ |
| ExLlama_HF | ✅ | ❌ | ❌ | ❌ | ✅ |
| ExLlamav2_HF | ✅ | ✅ | ❌ | ❌ | ✅ |
| ExLlama | ✅ | ❌ | ❌ | ❌ | use ExLlama_HF |
| ExLlamav2 | ✅ | ✅ | ❌ | ❌ | use ExLlamav2_HF |
| AutoGPTQ | ✅ | ❌ | ❌ | ✅ | ✅ |
| GPTQ-for-LLaMa | ✅** | ✅*** | ✅ | ✅ | ✅ |
28 changes: 7 additions & 21 deletions extensions/superboogav2/chat_handler.py
@@ -1,14 +1,14 @@
"""
This module is responsible for modifying the chat prompt and history.
"""
import json
import re

import extensions.superboogav2.parameters as parameters

from modules import chat
from modules import chat, shared
from modules.text_generation import get_encoded_length
from modules.logging_colors import logger
from modules.chat import load_character_memoized
from extensions.superboogav2.utils import create_context_text, create_metadata_source

from .data_processor import process_and_add_to_collector
@@ -17,14 +17,6 @@

CHAT_METADATA = create_metadata_source('automatic-chat-insert')

INSTRUCT_MODE = 'instruct'
CHAT_INSTRUCT_MODE = 'chat-instruct'


def _is_instruct_mode(state: dict):
mode = state.get('mode')
return mode == INSTRUCT_MODE or mode == CHAT_INSTRUCT_MODE


def _remove_tag_if_necessary(user_input: str):
if not parameters.get_is_manual():
@@ -51,17 +43,11 @@ def _format_single_exchange(name, text):


def _get_names(state: dict):
if _is_instruct_mode(state):
user_name = state['name1_instruct']
bot_name = state['name2_instruct']
else:
user_name = state['name1']
bot_name = state['name2']

if not user_name:
user_name = 'User'
if not bot_name:
bot_name = 'Assistant'
default_char = shared.settings.get('character', "Assistant")
default_user = shared.settings.get('name1', "You")
character = state.get('character', default_char)
user_name = state.get('name1', default_user)
user_name, bot_name, _, _, _ = load_character_memoized(character, user_name, '')

return user_name, bot_name

4 changes: 2 additions & 2 deletions extensions/superboogav2/notebook_handler.py
@@ -16,9 +16,9 @@ def _remove_special_tokens(string):
return re.sub(pattern, '', string)


def input_modifier_internal(string, collector):
def input_modifier_internal(string, collector, is_chat):
# Sanity check.
if shared.is_chat():
if is_chat:
return string

# Find the user input
4 changes: 2 additions & 2 deletions extensions/superboogav2/script.py
@@ -167,8 +167,8 @@ def custom_generate_chat_prompt(user_input, state, **kwargs):
return custom_generate_chat_prompt_internal(user_input, state, collector, **kwargs)


def input_modifier(string):
return input_modifier_internal(string, collector)
def input_modifier(string, state, is_chat=False):
return input_modifier_internal(string, collector, is_chat)


def ui():
46 changes: 24 additions & 22 deletions js/main.js
@@ -178,11 +178,11 @@ for(i = 0; i < noBackgroundelements.length; i++) {
noBackgroundelements[i].parentNode.parentNode.parentNode.style.alignItems = "center";
}

const slimDropdownElements = document.querySelectorAll('.slim-dropdown');
const slimDropdownElements = document.querySelectorAll(".slim-dropdown");
for (i = 0; i < slimDropdownElements.length; i++) {
const parentNode = slimDropdownElements[i].parentNode;
parentNode.style.background = 'transparent';
parentNode.style.border = '0';
const parentNode = slimDropdownElements[i].parentNode;
parentNode.style.background = "transparent";
parentNode.style.border = "0";
}

//------------------------------------------------
@@ -313,7 +313,7 @@ function addBigPicture() {
}

function deleteBigPicture() {
var bigProfilePictures = document.querySelectorAll('.bigProfilePicture');
var bigProfilePictures = document.querySelectorAll(".bigProfilePicture");
bigProfilePictures.forEach(function (element) {
element.parentNode.removeChild(element);
});
@@ -337,25 +337,27 @@ let currentChatInputHeight = 0;

function updateCssProperties() {
// Set the height of the chat area
const chatContainer = document.getElementById('chat').parentNode.parentNode.parentNode;
const chatInputHeight = document.querySelector('#chat-input textarea').clientHeight;
const newChatHeight = `${chatContainer.clientHeight - chatInputHeight + 40}px`;
document.documentElement.style.setProperty('--chat-height', newChatHeight);
document.documentElement.style.setProperty('--input-delta', `${chatInputHeight - 40}px`);

// Set the position offset of the chat input box
const header = document.querySelector('.header_bar');
const headerHeight = `${header.clientHeight}px`;
document.documentElement.style.setProperty('--header-height', headerHeight);

// Offset the scroll position of the chat area
if (chatInputHeight !== currentChatInputHeight) {
chatContainer.scrollTop += chatInputHeight > currentChatInputHeight ? chatInputHeight : -chatInputHeight;
currentChatInputHeight = chatInputHeight;
const chatContainer = document.getElementById("chat").parentNode.parentNode.parentNode;
const chatInputHeight = document.querySelector("#chat-input textarea").clientHeight;
if (chatContainer.clientHeight > 0) {
const newChatHeight = `${chatContainer.clientHeight - chatInputHeight + 40}px`;
document.documentElement.style.setProperty("--chat-height", newChatHeight);
document.documentElement.style.setProperty("--input-delta", `${chatInputHeight - 40}px`);

// Set the position offset of the chat input box
const header = document.querySelector(".header_bar");
const headerHeight = `${header.clientHeight}px`;
document.documentElement.style.setProperty("--header-height", headerHeight);

// Offset the scroll position of the chat area
if (chatInputHeight !== currentChatInputHeight) {
chatContainer.scrollTop += chatInputHeight > currentChatInputHeight ? chatInputHeight : -chatInputHeight;
currentChatInputHeight = chatInputHeight;
}
}
}

new ResizeObserver(updateCssProperties)
.observe(document.querySelector('#chat-input textarea'));
.observe(document.querySelector("#chat-input textarea"));

window.addEventListener('resize', updateCssProperties);
window.addEventListener("resize", updateCssProperties);
4 changes: 2 additions & 2 deletions js/show_controls.js
@@ -12,9 +12,9 @@ function toggle_controls(value) {
document.getElementById("chat-col").classList.remove("bigchat");
document.getElementById("chat-tab").style.paddingBottom = "";

let gallery_element = document.getElementById('gallery-extension');
let gallery_element = document.getElementById("gallery-extension");
if (gallery_element) {
gallery_element.style.display = 'block';
gallery_element.style.display = "block";
}

} else {
2 changes: 1 addition & 1 deletion js/update_big_picture.js
@@ -1,5 +1,5 @@
function updateBigPicture() {
var existingElement = document.querySelector('.bigProfilePicture');
var existingElement = document.querySelector(".bigProfilePicture");
if (existingElement) {
var timestamp = new Date().getTime();
existingElement.src = "/file/cache/pfp_character.png?time=" + timestamp;