Merge branch 'master' into cublas_prompt
EricLBuehler authored May 15, 2024
2 parents 64f656b + b616e44 commit 0979c5f
Showing 60 changed files with 683 additions and 756 deletions.
64 changes: 64 additions & 0 deletions .github/workflows/analysis.yaml
@@ -0,0 +1,64 @@
name: Analysis
on:
  pull_request_target

jobs:
  comment:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install Rust and Cargo
        run: |
          curl -sSf https://sh.rustup.rs | sh -s -- -y
          source $HOME/.cargo/env
      - name: Install Tokei
        run: cargo install tokei

      - name: Run Tokei and get the lines of code
        run: tokei . > tokei_output.txt

      - name: Comment or Update PR
        uses: actions/github-script@v6
        with:
          script: |
            const fs = require('fs');
            const tokeiOutput = fs.readFileSync('tokei_output.txt', 'utf8');
            const uniqueIdentifier = 'Code Metrics Report';
            const codeReport = `
            <details>
            <summary>${uniqueIdentifier}</summary>
            <pre>
            ${tokeiOutput}
            </pre>
            </details>
            `;
            const issue_number = context.issue.number;
            const { owner, repo } = context.repo;
            const comments = await github.rest.issues.listComments({
              issue_number,
              owner,
              repo
            });
            const existingComment = comments.data.find(comment => comment.body.includes(uniqueIdentifier));
            if (existingComment) {
              await github.rest.issues.updateComment({
                owner,
                repo,
                comment_id: existingComment.id,
                body: codeReport
              });
            } else {
              await github.rest.issues.createComment({
                issue_number,
                owner,
                repo,
                body: codeReport
              });
            }
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -10,7 +10,7 @@ members = [
resolver = "2"

[workspace.package]
version = "0.1.6"
version = "0.1.7"
edition = "2021"
description = "Fast and easy LLM serving."
homepage = "https://github.com/EricLBuehler/mistral.rs"
3 changes: 0 additions & 3 deletions Dockerfile
@@ -1,7 +1,6 @@
FROM rust:latest as builder

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
python3-dev \
&& rm -rf /var/lib/apt/lists/*

WORKDIR /mistralrs
@@ -24,8 +23,6 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
libssl-dev \
curl \
pkg-config \
python3 \
python3-dev \
&& rm -rf /var/lib/apt/lists/*

FROM base
20 changes: 13 additions & 7 deletions Dockerfile-cuda-all
@@ -1,12 +1,9 @@
FROM nvidia/cuda:12.3.2-cudnn9-devel-ubuntu22.04 AS builder
FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 AS builder

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
curl \
libssl-dev \
pkg-config \
python3 \
python3-pip \
python3-dev \
&& rm -rf /var/lib/apt/lists/*

RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
@@ -24,19 +21,28 @@ ARG FEATURES="cuda cudnn"
ENV RAYON_NUM_THREADS=4
RUN RUSTFLAGS="-Z threads=4" cargo build --release --workspace --exclude mistralrs-pyo3 --features "${FEATURES}"

FROM nvidia/cuda:12.3.2-cudnn9-runtime-ubuntu22.04 as base
FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04 as base

ENV HUGGINGFACE_HUB_CACHE=/data \
PORT=80 \
RAYON_NUM_THREADS=8
RAYON_NUM_THREADS=8 \
LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH

# Run the script to create symlinks in /usr/local/cuda/lib64
RUN set -eux; \
for lib in $(ls /usr/local/cuda/lib64); do \
base=$(echo $lib | sed -r 's/(.+)\.so\..+/\1.so/'); \
if [ "$lib" != "$base" ]; then \
ln -sf "/usr/local/cuda/lib64/$lib" "/usr/local/cuda/lib64/$base"; \
fi; \
done

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
libomp-dev \
ca-certificates \
libssl-dev \
curl \
pkg-config \
libpython3.10-dev \
&& rm -rf /var/lib/apt/lists/*

FROM base
12 changes: 7 additions & 5 deletions README.md
@@ -17,7 +17,7 @@ Mistral.rs is a fast LLM inference platform supporting inference on a variety of
- More models: please submit requests [here](https://github.com/EricLBuehler/mistral.rs/issues/156).
- X-LoRA: Scalings `topk` and softmax `topk` ([#48](https://github.com/EricLBuehler/mistral.rs/issues/48)).
- Parallel linear layers (sharding) ([#50](https://github.com/EricLBuehler/mistral.rs/issues/50)).
- Speculative decoding: https://arxiv.org/pdf/2211.17192
- Vision models: Idefics 2 ([#309](https://github.com/EricLBuehler/mistral.rs/pull/309)).

**Running the new Llama 3 model**

@@ -252,18 +252,20 @@ or
./mistralrs-server gguf -m . -t . -f Phi-3-mini-128k-instruct-q4_K_M.gguf
```

The following files must be present in the paths for the options below:
- `--model-id` (server) or `model_id` (python) or `--tok-model-id` (server) or `tok_model_id` (python):
Throughout mistral.rs, any model ID argument or option may be a local path and should contain the following files for each model ID option (a small sanity-check sketch follows this list):
- `--model-id` (server) or `model_id` (python/rust) or `--tok-model-id` (server) or `tok_model_id` (python/rust):
  - `config.json`
  - `tokenizer_config.json`
  - `tokenizer.json` (if not specified separately)
  - `.safetensors` files.
- `--quantized-model-id` (server) or `quantized_model_id` (python):
- `--quantized-model-id` (server) or `quantized_model_id` (python/rust):
  - Specified `.gguf` or `.ggml` file.
- `--x-lora-model-id` (server) or `xlora_model_id` (python):
- `--x-lora-model-id` (server) or `xlora_model_id` (python/rust):
  - `xlora_classifier.safetensors`
  - `xlora_config.json`
  - Adapters `.safetensors` and `adapter_config.json` files in their respective directories
- `--adapters-model-id` (server) or `adapters_model_id` (python/rust):
  - Adapters `.safetensors` and `adapter_config.json` files in their respective directories
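
As a quick illustration only (this helper is hypothetical and not part of mistral.rs, and `./my-model` is just an example path), a few lines of Python can check that a local directory passed as a plain model ID contains the files listed above:

```python
from pathlib import Path

def check_plain_model_dir(path: str) -> None:
    """Hypothetical check: does a local `--model-id`/`model_id` path contain the expected files?"""
    model_dir = Path(path)
    # tokenizer.json may be supplied separately, so it is only warned about below.
    required = ["config.json", "tokenizer_config.json"]
    missing = [name for name in required if not (model_dir / name).exists()]
    if not list(model_dir.glob("*.safetensors")):
        missing.append("*.safetensors files")
    if missing:
        raise FileNotFoundError(f"{model_dir} is missing: {', '.join(missing)}")
    if not (model_dir / "tokenizer.json").exists():
        print("note: no tokenizer.json found; specify the tokenizer separately")

check_plain_model_dir("./my-model")  # example local path
```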

### Run

30 changes: 24 additions & 6 deletions docs/ADAPTER_MODELS.md
@@ -25,7 +25,14 @@ When using an adapter model with a quantized base model, if the ordering file sp
**Preparing the X-LoRA/LoRA Ordering File**
The X-LoRA/LoRA ordering file is necessary to prepare before inference with an X-LoRA model. However, it is easy with a provided [`script`](../scripts/create_ordering.py)!

The X-LoRA/LoRA ordering JSON file contains 2 parts. The first is the order of the adapters and the second, the layer ordering. The layer ordering has been automatically generated and should not be manipulated as it controls the application of scalings. However the order of adapter should be an array of strings which are the adapter names corresponding to the order the adapters were specified during training. For example, if the adapters were specified as a dictionary:
### X-LoRA case
An ordering JSON file for X-LoRA contains 2 major parts.

1) The adapter names `order`
   - The order matters!
   - Should be an array of strings which are the adapter names corresponding to the order the adapters were specified during training. For example, if the adapters were specified as a dictionary:
2) The layer ordering `layers`
   - Automatically generated and should not be manipulated as it controls the application of scalings.

```python
adapters = {
Expand All @@ -37,9 +44,20 @@ adapters = {

The specified order would be `["math", "reasoning", "biology"]`.

For LoRA models, the order of the adapters does not matter. You can reorder them or remove some to control which adapters will be used. However, for an X-LoRA model, the order of the adapters in the ordering file is important.
We provide an [ordering file](../orderings/xlora-paper-ordering.json) which contains the ordering for the X-LoRA model associated with [the paper](https://arxiv.org/abs/2402.07148) and the Huggingface repository: https://huggingface.co/lamm-mit/x-lora.

### LoRA case
An ordering JSON file for LoRA contains 2 major parts (a minimal example is sketched after this list):
1) The adapter names `order` (optional):
   - The order does not matter
   - Controls which adapters will be initially activated
   - If this key is not specified, then no adapters will be activated initially
2) Preload adapter section `preload_adapters` (optional): [see this section](#adapter-model-dynamic-adapter-activation)
   - Order does not matter
   - Specifies the adapter name and the model ID to find them, which may be a local path.
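
As a minimal sketch only — the adapter names and paths are made up, the exact key layout of `preload_adapters` entries is an assumption, and the `layers` section should be produced by the provided scripts rather than written by hand — an ordering file could be assembled like this:

```python
import json

# Hypothetical LoRA ordering file. Adapter names and model IDs are illustrative;
# `layers` is omitted here because the provided scripts generate it and it
# should not be manipulated by hand.
ordering = {
    # Optional: adapters to activate initially (order does not matter for LoRA).
    "order": ["math", "reasoning"],
    # Optional: preloadable adapters, each with a name and the model ID
    # (which may be a local path) where its files are found.
    "preload_adapters": [
        {"name": "biology", "adapter_model_id": "./adapters/biology"},
    ],
}

with open("my-lora-ordering.json", "w") as f:
    json.dump(ordering, f, indent=2)
```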

There are 2 scripts to prepare the ordering file. The ordering file is specific to each architecture and set of target modules. Therefore, if either are changed, it is necessary to create a new ordering file using the first option. If only the adapter order or adapters changed, then it the second option should be used.
### Preparing the ordering file (LoRA or X-LoRA cases)
There are 2 scripts to prepare the ordering file, and they work for both X-LoRA and LoRA. The ordering file is specific to each architecture and set of target modules. Therefore, if either is changed, it is necessary to create a new ordering file using the first option. If only the adapter order or adapters changed, then the second option should be used.

1) From scratch: No ordering file for the architecture and target modules

@@ -49,11 +67,11 @@ There are 2 scripts to prepare the ordering file. The ordering file is specific

A script [`set_names.py`](../scripts/set_names.py) is provided which prompts the user for the adapter names and the old ordering file. The user is prompted for an output file location, relative to the working directory.

We provide an [ordering file](../orderings/xlora-paper-ordering.json) which contains the ordering for the X-LoRA model associated with [the paper](https://arxiv.org/abs/2402.07148) and the Huggingface repository: https://huggingface.co/lamm-mit/x-lora.
### Quantized X-LoRA or LoRA models

**Quantized X-LoRA or LoRA models**
Mistral.rs supports running quantized models with X-LoRA or LoRA. The X-LoRA or LoRA adapter layers will not be quantized, only the base model.

Mistral.rs supports running quantized models with X-LoRA or LoRA. The X-LoRA or LoRA adapter layers will not be quantized, only the base model. Please note that using a high quantization level (eg., 4-bit) can distort the signal and prevent the classifier from acting properly. Therefore, it is better to use slightly lower levels such as 8-bit.
In the X-LoRA case, please note that using a high quantization level (e.g., 4-bit) can distort the signal and prevent the classifier from acting properly. Therefore, it is better to use slightly lower levels such as 8-bit.


## Avoiding the scaling pass with non-granular scalings
2 changes: 1 addition & 1 deletion docs/CMD_LINE_DOCS.md
@@ -97,7 +97,7 @@ Options:
## For X-LoRA and quantized models
This is an example which is roughly the same for all adapter + quantized models. This is specifically for: `./mistralrs_server x-lora-gguf --help`
This example is broadly the same for all X-LoRA/LoRA + quantized models. It is specifically for: `./mistralrs_server x-lora-gguf --help`
```bash
Select a GGUF model with X-LoRA
6 changes: 5 additions & 1 deletion docs/ISQ.md
@@ -16,7 +16,11 @@ Possible values for ISQ quantization:
- Q6K
- Q8K

When using ISQ, it will automatically load non ISQ-able weights into CPU memory before applying ISQ. The ISQ application process moves the weights to device memory. This process is implemented to avoid memory spikes from loading the model in full precision.
When using ISQ, it will automatically load ISQ-able weights into CPU memory before applying ISQ. The ISQ application process moves the weights to device memory. This process is implemented to avoid memory spikes from loading the model in full precision.

If a tensor cannot be quantized, the fallback process is as follows (a short illustrative sketch follows this list):
1) If using a `K` quant, fall back to a similar `Q` quant.
2) If that is not possible, use `F32` as the data type.
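
The following is an illustrative sketch of those two rules only — the specific `K` → `Q` mapping and the quantizability check are assumptions, not the actual mistral.rs implementation:

```python
# Illustrative only: the K -> Q pairs below are assumed "similar" quants,
# not the real fallback table used by mistral.rs.
K_QUANT_FALLBACK = {
    "Q2K": "Q4_0",
    "Q3K": "Q4_0",
    "Q4K": "Q4_1",
    "Q5K": "Q5_1",
    "Q6K": "Q8_0",
    "Q8K": "Q8_0",
}

def isq_dtype_for(requested: str, q_fallback_possible: bool) -> str:
    """Pick a data type for a tensor that cannot take the requested ISQ quant."""
    # Rule 1: a K quant falls back to a similar Q quant, when that is possible.
    if requested in K_QUANT_FALLBACK and q_fallback_possible:
        return K_QUANT_FALLBACK[requested]
    # Rule 2: otherwise, keep the tensor in F32.
    return "F32"

print(isq_dtype_for("Q4K", q_fallback_possible=True))   # Q4_1
print(isq_dtype_for("Q4K", q_fallback_possible=False))  # F32
```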

## Python Example
```python
2 changes: 1 addition & 1 deletion mistralrs-bench/Cargo.toml
@@ -17,7 +17,7 @@ candle-core.workspace = true
serde.workspace = true
serde_json.workspace = true
clap.workspace = true
mistralrs-core = { version = "0.1.6", path = "../mistralrs-core" }
mistralrs-core = { version = "0.1.7", path = "../mistralrs-core" }
tracing.workspace = true
tracing-subscriber.workspace = true
either.workspace = true
7 changes: 4 additions & 3 deletions mistralrs-core/Cargo.toml
@@ -26,7 +26,7 @@ tokenizers = "0.15.2"
tqdm = "0.7.0"
range-checked = { git = "https://github.com/EricLBuehler/range-checked.git", version = "0.1.0" }
chrono = "0.4.34"
mistralrs-lora = { version = "0.1.6", path = "../mistralrs-lora" }
mistralrs-lora = { version = "0.1.7", path = "../mistralrs-lora" }
minijinja = "1.0.12"
either.workspace = true
indexmap.workspace = true
@@ -44,19 +44,20 @@ galil-seiferas = "0.1.5"
clap.workspace = true
radix_trie = "0.2.1"
bytemuck = "1.15.0"
pyo3.workspace = true
rayon = "1.10.0"
tokio.workspace = true
tokio-rayon = "2.1.0"
rand_isaac = "0.3.0"
futures.workspace = true
pyo3 = {workspace = true, optional = true }
indicatif = { version = "0.17.8", features = ["rayon"] }
async-trait = "0.1.80"
once_cell = "1.19.0"
toml = "0.8.12"
ctrlc = "3.4.4"


[features]
pyo3_macros = ["pyo3"]
cuda = ["candle-core/cuda", "candle-nn/cuda", "candle-transformers/cuda"]
cudnn = ["candle-core/cudnn"]
metal = ["candle-core/metal", "candle-nn/metal", "candle-transformers/metal"]
15 changes: 3 additions & 12 deletions mistralrs-core/src/engine/mod.rs
@@ -1,9 +1,6 @@
use std::{
collections::{HashMap, VecDeque},
sync::{
atomic::{AtomicBool, Ordering},
Arc,
},
sync::{atomic::AtomicBool, Arc},
time::{Instant, SystemTime, UNIX_EPOCH},
};
use tokio::sync::{mpsc::Receiver, Mutex};
@@ -33,7 +30,8 @@ use crate::{
};

const SEED: u64 = 0;
pub(crate) static TERMINATE_ALL_NEXT_STEP: AtomicBool = AtomicBool::new(false);
/// Terminate all sequences on the next scheduling step. Be sure to reset this.
pub static TERMINATE_ALL_NEXT_STEP: AtomicBool = AtomicBool::new(false);

pub struct Engine {
rx: Receiver<Request>,
@@ -58,16 +56,9 @@ impl Engine {
no_prefix_cache: bool,
prefix_cache_n: usize,
disable_eos_stop: bool,
interactive: bool,
) -> Self {
let device = get_mut_arcmutex!(pipeline).device().clone();
let is_xlora = get_mut_arcmutex!(pipeline).get_metadata().is_xlora;
if interactive {
ctrlc::set_handler(move || {
TERMINATE_ALL_NEXT_STEP.store(true, Ordering::SeqCst);
})
.expect("Failed to set CTRL-C handler for interactive mode");
}
Self {
rx,
pipeline,
32 changes: 32 additions & 0 deletions mistralrs-core/src/layers.rs
@@ -365,3 +365,35 @@ impl CausalMasker {
}
}
}

#[cfg(feature = "flash-attn")]
pub fn flash_attn(
    q: &Tensor,
    k: &Tensor,
    v: &Tensor,
    softmax_scale: f32,
    causal: bool,
) -> Result<Tensor> {
    candle_flash_attn::flash_attn(q, k, v, softmax_scale, causal)
}

#[cfg(not(feature = "flash-attn"))]
pub fn flash_attn(_: &Tensor, _: &Tensor, _: &Tensor, _: f32, _: bool) -> Result<Tensor> {
    unimplemented!("Compile with '--features flash-attn'")
}

/// Verify that the architecture read from a GGUF file matches the expected architecture.
pub fn verify_sanity_gguf(arch: &str, expected_arch: &str) -> Result<()> {
    if arch != expected_arch {
        candle_core::bail!("Expected `{expected_arch}` architecture, got `{arch}`.");
    }
    Ok(())
}

/// Repeat key/value heads `n_rep` times so the number of KV heads matches the
/// number of attention heads (used for grouped-query attention).
pub fn repeat_kv(x: Tensor, n_rep: usize) -> Result<Tensor> {
    if n_rep == 1 {
        Ok(x)
    } else {
        let (b_sz, n_kv_head, seq_len, head_dim) = x.dims4()?;
        Tensor::cat(&vec![&x; n_rep], 2)?.reshape((b_sz, n_kv_head * n_rep, seq_len, head_dim))
    }
}
