Merge branch 'master' into cublas_prompt
EricLBuehler authored May 15, 2024
2 parents 64f656b + b616e44 commit 0979c5f
Showing 60 changed files with 683 additions and 756 deletions.
64 changes: 64 additions & 0 deletions .github/workflows/analysis.yaml
@@ -0,0 +1,64 @@
name: Analysis
on:
  pull_request_target

jobs:
  comment:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install Rust and Cargo
        run: |
          curl -sSf https://sh.rustup.rs | sh -s -- -y
          source $HOME/.cargo/env
      - name: Install Tokei
        run: cargo install tokei

      - name: Run Tokei and get the lines of code
        run: tokei . > tokei_output.txt

      - name: Comment or Update PR
        uses: actions/github-script@v6
        with:
          script: |
            const fs = require('fs');
            const tokeiOutput = fs.readFileSync('tokei_output.txt', 'utf8');
            const uniqueIdentifier = 'Code Metrics Report';
            const codeReport = `
            <details>
            <summary>${uniqueIdentifier}</summary>
            <pre>
            ${tokeiOutput}
            </pre>
            </details>
            `;
            const issue_number = context.issue.number;
            const { owner, repo } = context.repo;
            const comments = await github.rest.issues.listComments({
              issue_number,
              owner,
              repo
            });
            const existingComment = comments.data.find(comment => comment.body.includes(uniqueIdentifier));
            if (existingComment) {
              await github.rest.issues.updateComment({
                owner,
                repo,
                comment_id: existingComment.id,
                body: codeReport
              });
            } else {
              await github.rest.issues.createComment({
                issue_number,
                owner,
                repo,
                body: codeReport
              });
            }
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -10,7 +10,7 @@ members = [
resolver = "2"

[workspace.package]
version = "0.1.6"
version = "0.1.7"
edition = "2021"
description = "Fast and easy LLM serving."
homepage = "https://github.com/EricLBuehler/mistral.rs"
3 changes: 0 additions & 3 deletions Dockerfile
@@ -1,7 +1,6 @@
FROM rust:latest as builder

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
python3-dev \
&& rm -rf /var/lib/apt/lists/*

WORKDIR /mistralrs
@@ -24,8 +23,6 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
libssl-dev \
curl \
pkg-config \
python3 \
python3-dev \
&& rm -rf /var/lib/apt/lists/*

FROM base
20 changes: 13 additions & 7 deletions Dockerfile-cuda-all
@@ -1,12 +1,9 @@
FROM nvidia/cuda:12.3.2-cudnn9-devel-ubuntu22.04 AS builder
FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 AS builder

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
curl \
libssl-dev \
pkg-config \
python3 \
python3-pip \
python3-dev \
&& rm -rf /var/lib/apt/lists/*

RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
@@ -24,19 +21,28 @@ ARG FEATURES="cuda cudnn"
ENV RAYON_NUM_THREADS=4
RUN RUSTFLAGS="-Z threads=4" cargo build --release --workspace --exclude mistralrs-pyo3 --features "${FEATURES}"

FROM nvidia/cuda:12.3.2-cudnn9-runtime-ubuntu22.04 as base
FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04 as base

ENV HUGGINGFACE_HUB_CACHE=/data \
PORT=80 \
RAYON_NUM_THREADS=8
RAYON_NUM_THREADS=8 \
LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH

# Run the script to create symlinks in /usr/local/cuda/lib64
RUN set -eux; \
for lib in $(ls /usr/local/cuda/lib64); do \
base=$(echo $lib | sed -r 's/(.+)\.so\..+/\1.so/'); \
if [ "$lib" != "$base" ]; then \
ln -sf "/usr/local/cuda/lib64/$lib" "/usr/local/cuda/lib64/$base"; \
fi; \
done

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
libomp-dev \
ca-certificates \
libssl-dev \
curl \
pkg-config \
libpython3.10-dev \
&& rm -rf /var/lib/apt/lists/*

FROM base
12 changes: 7 additions & 5 deletions README.md
@@ -17,7 +17,7 @@ Mistral.rs is a fast LLM inference platform supporting inference on a variety of
- More models: please submit requests [here](https://github.com/EricLBuehler/mistral.rs/issues/156).
- X-LoRA: Scalings `topk` and softmax `topk` ([#48](https://github.com/EricLBuehler/mistral.rs/issues/48)).
- Parallel linear layers (sharding) ([#50](https://github.com/EricLBuehler/mistral.rs/issues/50)).
- Speculative decoding: https://arxiv.org/pdf/2211.17192
- Vision models: Idefics 2 ([#309](https://github.com/EricLBuehler/mistral.rs/pull/309)).

**Running the new Llama 3 model**

@@ -252,18 +252,20 @@ or
./mistralrs-server gguf -m . -t . -f Phi-3-mini-128k-instruct-q4_K_M.gguf
```

The following files must be present in the paths for the options below:
- `--model-id` (server) or `model_id` (python) or `--tok-model-id` (server) or `tok_model_id` (python):
Throughout mistral.rs, any model ID argument or option may be a local path and should contain the following files for each model ID option (a small sanity-check sketch follows this list):
- `--model-id` (server) or `model_id` (python/rust) or `--tok-model-id` (server) or `tok_model_id` (python/rust):
  - `config.json`
  - `tokenizer_config.json`
  - `tokenizer.json` (if not specified separately)
  - `.safetensors` files.
- `--quantized-model-id` (server) or `quantized_model_id` (python):
- `--quantized-model-id` (server) or `quantized_model_id` (python/rust):
  - Specified `.gguf` or `.ggml` file.
- `--x-lora-model-id` (server) or `xlora_model_id` (python):
- `--x-lora-model-id` (server) or `xlora_model_id` (python/rust):
  - `xlora_classifier.safetensors`
  - `xlora_config.json`
  - Adapters `.safetensors` and `adapter_config.json` files in their respective directories
- `--adapters-model-id` (server) or `adapters_model_id` (python/rust):
  - Adapters `.safetensors` and `adapter_config.json` files in their respective directories
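
As a quick illustration only (this helper is hypothetical and not part of mistral.rs, and `./my-model` is just an example path), a few lines of Python can check that a local directory passed as a plain model ID contains the files listed above:

```python
from pathlib import Path

def check_plain_model_dir(path: str) -> None:
    """Hypothetical check: does a local `--model-id`/`model_id` path contain the expected files?"""
    model_dir = Path(path)
    # tokenizer.json may be supplied separately, so it is only warned about below.
    required = ["config.json", "tokenizer_config.json"]
    missing = [name for name in required if not (model_dir / name).exists()]
    if not list(model_dir.glob("*.safetensors")):
        missing.append("*.safetensors files")
    if missing:
        raise FileNotFoundError(f"{model_dir} is missing: {', '.join(missing)}")
    if not (model_dir / "tokenizer.json").exists():
        print("note: no tokenizer.json found; specify the tokenizer separately")

check_plain_model_dir("./my-model")  # example local path
```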

### Run

30 changes: 24 additions & 6 deletions docs/ADAPTER_MODELS.md
@@ -25,7 +25,14 @@ When using an adapter model with a quantized base model, if the ordering file sp
**Preparing the X-LoRA/LoRA Ordering File**
The X-LoRA/LoRA ordering file is necessary to prepare before inference with an X-LoRA model. However, it is easy with a provided [`script`](../scripts/create_ordering.py)!

The X-LoRA/LoRA ordering JSON file contains 2 parts. The first is the order of the adapters and the second, the layer ordering. The layer ordering has been automatically generated and should not be manipulated as it controls the application of scalings. However the order of adapter should be an array of strings which are the adapter names corresponding to the order the adapters were specified during training. For example, if the adapters were specified as a dictionary:
### X-LoRA case
An ordering JSON file for X-LoRA contains 2 major parts.

1) The adapter names `order`
   - The order matters!
   - Should be an array of strings which are the adapter names corresponding to the order the adapters were specified during training. For example, if the adapters were specified as a dictionary:
2) The layer ordering `layers`
   - Automatically generated and should not be manipulated as it controls the application of scalings.

```python
adapters = {
Expand All @@ -37,9 +44,20 @@ adapters = {

The specified order would be `["math", "reasoning", "biology"]`.

For LoRA models, the order of the adapters does not matter. You can reorder them or remove some to control which adapters will be used. However, for an X-LoRA model, the order of the adapters in the ordering file is important.
We provide an [ordering file](../orderings/xlora-paper-ordering.json) which contains the ordering for the X-LoRA model associated with [the paper](https://arxiv.org/abs/2402.07148) and the Huggingface repository: https://huggingface.co/lamm-mit/x-lora.

### LoRA case
An ordering JSON file for LoRA contains 2 major parts (a minimal example is sketched after this list):
1) The adapter names `order` (optional):
   - The order does not matter
   - Controls which adapters will be initially activated
   - If this key is not specified, then no adapters will be activated initially
2) Preload adapter section `preload_adapters` (optional): [see this section](#adapter-model-dynamic-adapter-activation)
   - Order does not matter
   - Specifies the adapter name and the model ID to find them, which may be a local path.
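
As a minimal sketch only — the adapter names and paths are made up, the exact key layout of `preload_adapters` entries is an assumption, and the `layers` section should be produced by the provided scripts rather than written by hand — an ordering file could be assembled like this:

```python
import json

# Hypothetical LoRA ordering file. Adapter names and model IDs are illustrative;
# `layers` is omitted here because the provided scripts generate it and it
# should not be manipulated by hand.
ordering = {
    # Optional: adapters to activate initially (order does not matter for LoRA).
    "order": ["math", "reasoning"],
    # Optional: preloadable adapters, each with a name and the model ID
    # (which may be a local path) where its files are found.
    "preload_adapters": [
        {"name": "biology", "adapter_model_id": "./adapters/biology"},
    ],
}

with open("my-lora-ordering.json", "w") as f:
    json.dump(ordering, f, indent=2)
```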

There are 2 scripts to prepare the ordering file. The ordering file is specific to each architecture and set of target modules. Therefore, if either are changed, it is necessary to create a new ordering file using the first option. If only the adapter order or adapters changed, then it the second option should be used.
### Preparing the ordering file (LoRA or X-LoRA cases)
There are 2 scripts to prepare the ordering file, and they work for both X-LoRA and LoRA. The ordering file is specific to each architecture and set of target modules. Therefore, if either is changed, it is necessary to create a new ordering file using the first option. If only the adapter order or adapters changed, then the second option should be used.

1) From scratch: No ordering file for the architecture and target modules

@@ -49,11 +67,11 @@ There are 2 scripts to prepare the ordering file. The ordering file is specific

A script [`set_names.py`](../scripts/set_names.py) is provided which prompts the user for the adapter names and the old ordering file. The user is prompted for an output file location, relative to the working directory.

We provide an [ordering file](../orderings/xlora-paper-ordering.json) which contains the ordering for the X-LoRA model associated with [the paper](https://arxiv.org/abs/2402.07148) and the Huggingface repository: https://huggingface.co/lamm-mit/x-lora.
### Quantized X-LoRA or LoRA models

**Quantized X-LoRA or LoRA models**
Mistral.rs supports running quantized models with X-LoRA or LoRA. The X-LoRA or LoRA adapter layers will not be quantized, only the base model.

Mistral.rs supports running quantized models with X-LoRA or LoRA. The X-LoRA or LoRA adapter layers will not be quantized, only the base model. Please note that using a high quantization level (eg., 4-bit) can distort the signal and prevent the classifier from acting properly. Therefore, it is better to use slightly lower levels such as 8-bit.
In the X-LoRA case, please note that using a high quantization level (e.g., 4-bit) can distort the signal and prevent the classifier from acting properly. Therefore, it is better to use slightly lower levels such as 8-bit.


## Avoiding the scaling pass with non-granular scalings
2 changes: 1 addition & 1 deletion docs/CMD_LINE_DOCS.md
@@ -97,7 +97,7 @@ Options:
## For X-LoRA and quantized models
This is an example which is roughly the same for all adapter + quantized models. This is specifically for: `./mistralrs_server x-lora-gguf --help`
This example is broadly the same for all X-LoRA/LoRA + quantized models. It is specifically for: `./mistralrs_server x-lora-gguf --help`
```bash
Select a GGUF model with X-LoRA
6 changes: 5 additions & 1 deletion docs/ISQ.md
@@ -16,7 +16,11 @@ Possible values for ISQ quantization:
- Q6K
- Q8K

When using ISQ, it will automatically load non ISQ-able weights into CPU memory before applying ISQ. The ISQ application process moves the weights to device memory. This process is implemented to avoid memory spikes from loading the model in full precision.
When using ISQ, it will automatically load ISQ-able weights into CPU memory before applying ISQ. The ISQ application process moves the weights to device memory. This process is implemented to avoid memory spikes from loading the model in full precision.

If a tensor cannot be quantized, the fallback process is as follows (a short illustrative sketch follows this list):
1) If using a `K` quant, fall back to a similar `Q` quant.
2) If that is not possible, use `F32` as the data type.
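
The following is an illustrative sketch of those two rules only — the specific `K` → `Q` mapping and the quantizability check are assumptions, not the actual mistral.rs implementation:

```python
# Illustrative only: the K -> Q pairs below are assumed "similar" quants,
# not the real fallback table used by mistral.rs.
K_QUANT_FALLBACK = {
    "Q2K": "Q4_0",
    "Q3K": "Q4_0",
    "Q4K": "Q4_1",
    "Q5K": "Q5_1",
    "Q6K": "Q8_0",
    "Q8K": "Q8_0",
}

def isq_dtype_for(requested: str, q_fallback_possible: bool) -> str:
    """Pick a data type for a tensor that cannot take the requested ISQ quant."""
    # Rule 1: a K quant falls back to a similar Q quant, when that is possible.
    if requested in K_QUANT_FALLBACK and q_fallback_possible:
        return K_QUANT_FALLBACK[requested]
    # Rule 2: otherwise, keep the tensor in F32.
    return "F32"

print(isq_dtype_for("Q4K", q_fallback_possible=True))   # Q4_1
print(isq_dtype_for("Q4K", q_fallback_possible=False))  # F32
```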

## Python Example
```python
2 changes: 1 addition & 1 deletion mistralrs-bench/Cargo.toml
@@ -17,7 +17,7 @@ candle-core.workspace = true
serde.workspace = true
serde_json.workspace = true
clap.workspace = true
mistralrs-core = { version = "0.1.6", path = "../mistralrs-core" }
mistralrs-core = { version = "0.1.7", path = "../mistralrs-core" }
tracing.workspace = true
tracing-subscriber.workspace = true
either.workspace = true
7 changes: 4 additions & 3 deletions mistralrs-core/Cargo.toml
@@ -26,7 +26,7 @@ tokenizers = "0.15.2"
tqdm = "0.7.0"
range-checked = { git = "https://github.com/EricLBuehler/range-checked.git", version = "0.1.0" }
chrono = "0.4.34"
mistralrs-lora = { version = "0.1.6", path = "../mistralrs-lora" }
mistralrs-lora = { version = "0.1.7", path = "../mistralrs-lora" }
minijinja = "1.0.12"
either.workspace = true
indexmap.workspace = true
@@ -44,19 +44,20 @@ galil-seiferas = "0.1.5"
clap.workspace = true
radix_trie = "0.2.1"
bytemuck = "1.15.0"
pyo3.workspace = true
rayon = "1.10.0"
tokio.workspace = true
tokio-rayon = "2.1.0"
rand_isaac = "0.3.0"
futures.workspace = true
pyo3 = {workspace = true, optional = true }
indicatif = { version = "0.17.8", features = ["rayon"] }
async-trait = "0.1.80"
once_cell = "1.19.0"
toml = "0.8.12"
ctrlc = "3.4.4"


[features]
pyo3_macros = ["pyo3"]
cuda = ["candle-core/cuda", "candle-nn/cuda", "candle-transformers/cuda"]
cudnn = ["candle-core/cudnn"]
metal = ["candle-core/metal", "candle-nn/metal", "candle-transformers/metal"]
15 changes: 3 additions & 12 deletions mistralrs-core/src/engine/mod.rs
@@ -1,9 +1,6 @@
use std::{
collections::{HashMap, VecDeque},
sync::{
atomic::{AtomicBool, Ordering},
Arc,
},
sync::{atomic::AtomicBool, Arc},
time::{Instant, SystemTime, UNIX_EPOCH},
};
use tokio::sync::{mpsc::Receiver, Mutex};
@@ -33,7 +30,8 @@ use crate::{
};

const SEED: u64 = 0;
pub(crate) static TERMINATE_ALL_NEXT_STEP: AtomicBool = AtomicBool::new(false);
/// Terminate all sequences on the next scheduling step. Be sure to reset this.
pub static TERMINATE_ALL_NEXT_STEP: AtomicBool = AtomicBool::new(false);

pub struct Engine {
rx: Receiver<Request>,
@@ -58,16 +56,9 @@ impl Engine {
no_prefix_cache: bool,
prefix_cache_n: usize,
disable_eos_stop: bool,
interactive: bool,
) -> Self {
let device = get_mut_arcmutex!(pipeline).device().clone();
let is_xlora = get_mut_arcmutex!(pipeline).get_metadata().is_xlora;
if interactive {
ctrlc::set_handler(move || {
TERMINATE_ALL_NEXT_STEP.store(true, Ordering::SeqCst);
})
.expect("Failed to set CTRL-C handler for interactive mode");
}
Self {
rx,
pipeline,
32 changes: 32 additions & 0 deletions mistralrs-core/src/layers.rs
@@ -365,3 +365,35 @@ impl CausalMasker {
}
}
}

#[cfg(feature = "flash-attn")]
pub fn flash_attn(
    q: &Tensor,
    k: &Tensor,
    v: &Tensor,
    softmax_scale: f32,
    causal: bool,
) -> Result<Tensor> {
    candle_flash_attn::flash_attn(q, k, v, softmax_scale, causal)
}

#[cfg(not(feature = "flash-attn"))]
pub fn flash_attn(_: &Tensor, _: &Tensor, _: &Tensor, _: f32, _: bool) -> Result<Tensor> {
    unimplemented!("Compile with '--features flash-attn'")
}

/// Verify that the architecture read from a GGUF file matches the expected architecture.
pub fn verify_sanity_gguf(arch: &str, expected_arch: &str) -> Result<()> {
    if arch != expected_arch {
        candle_core::bail!("Expected `{expected_arch}` architecture, got `{arch}`.");
    }
    Ok(())
}

/// Repeat key/value heads `n_rep` times so the number of KV heads matches the
/// number of attention heads (used for grouped-query attention).
pub fn repeat_kv(x: Tensor, n_rep: usize) -> Result<Tensor> {
    if n_rep == 1 {
        Ok(x)
    } else {
        let (b_sz, n_kv_head, seq_len, head_dim) = x.dims4()?;
        Tensor::cat(&vec![&x; n_rep], 2)?.reshape((b_sz, n_kv_head * n_rep, seq_len, head_dim))
    }
}
