Merge pull request #73 from opendatahub-io/main
Sync odh/main with odh/release
openshift-merge-bot[bot] authored Apr 23, 2024
2 parents daaa6b6 + 102f77d commit 795c087
Showing 23 changed files with 3,347 additions and 1,197 deletions.
491 changes: 288 additions & 203 deletions Cargo.lock

Large diffs are not rendered by default.

10 changes: 8 additions & 2 deletions Dockerfile
@@ -1,6 +1,6 @@
## Global Args #################################################################
ARG BASE_UBI_IMAGE_TAG=9.3-1610
ARG PROTOC_VERSION=25.2
ARG PROTOC_VERSION=25.3
ARG PYTORCH_INDEX="https://download.pytorch.org/whl"
# ARG PYTORCH_INDEX="https://download.pytorch.org/whl/nightly"
ARG AUTO_GPTQ_VERSION=0.7.1
@@ -86,7 +86,7 @@ ENV LIBRARY_PATH="$CUDA_HOME/lib64/stubs"

## Rust builder ################################################################
# Specific debian version so that compatible glibc version is used
FROM rust:1.77-bullseye as rust-builder
FROM rust:1.77.2-bullseye as rust-builder
ARG PROTOC_VERSION

ENV CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
@@ -164,6 +164,9 @@ RUN cd server && \
    make gen-server && \
    pip install ".[accelerate]" --no-cache-dir

# temp: install newer transformers lib that optimum clashes with
RUN pip install transformers==4.40.0 tokenizers==0.19.1 --no-cache-dir

# Patch codegen model changes into transformers
RUN cp server/transformers_patch/modeling_codegen.py ${SITE_PACKAGES}/transformers/models/codegen/modeling_codegen.py

@@ -288,6 +291,9 @@ COPY server server
# Ref: https://onnxruntime.ai/docs/install/#install-onnx-runtime-gpu-cuda-12x
RUN cd server && make gen-server && pip install ".[accelerate, ibm-fms, onnx-gpu, quantize]" --no-cache-dir --extra-index-url=https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/

# temp: install newer transformers lib that optimum clashes with
RUN pip install transformers==4.40.0 tokenizers==0.19.1 --no-cache-dir

# Patch codegen model changes into transformers 4.35
RUN cp server/transformers_patch/modeling_codegen.py ${SITE_PACKAGES}/transformers/models/codegen/modeling_codegen.py

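
The two `pip install transformers==4.40.0 tokenizers==0.19.1` layers added above are meant to win over whatever version optimum would otherwise resolve. A quick way to confirm the pins survive the rest of the build is to import both libraries inside the finished image; the snippet below is only a verification sketch and is not part of this commit:

```python
# Verification sketch (not part of the commit): run inside the built image to
# confirm the pinned versions are the ones importable at runtime.
import tokenizers
import transformers

assert transformers.__version__ == "4.40.0", transformers.__version__
assert tokenizers.__version__ == "0.19.1", tokenizers.__version__
print(f"transformers {transformers.__version__} / tokenizers {tokenizers.__version__}")
```
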
2 changes: 1 addition & 1 deletion integration_tests/Makefile
@@ -1,6 +1,6 @@
gen-client:
	# Compile protos
	pip install grpcio-tools==1.60.0 mypy-protobuf==3.5.0 'types-protobuf>=3.20.4' --no-cache-dir
	pip install grpcio-tools==1.62.2 mypy-protobuf==3.5.0 'types-protobuf>=3.20.4' --no-cache-dir
	mkdir text_generation_tests/pb || true
	python -m grpc_tools.protoc -I../proto --python_out=text_generation_tests/pb \
		--grpc_python_out=text_generation_tests/pb --mypy_out=text_generation_tests/pb ../proto/generation.proto
611 changes: 303 additions & 308 deletions integration_tests/poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion integration_tests/pyproject.toml
@@ -9,7 +9,7 @@ python = ">=3.11"

[tool.poetry.group.dev.dependencies]
protobuf = "^4.25.3"
grpcio-tools = "^1.62.1"
grpcio-tools = "^1.62.2"
pytest = "^8.1.1"
pytest-asyncio = "^0.23.6"
requests = "^2.31.0"
85 changes: 85 additions & 0 deletions integration_tests/sample_client.py
@@ -0,0 +1,85 @@
import time
import grpc
from google.protobuf import json_format
from text_generation_tests.pb import generation_pb2_grpc as gpb2, generation_pb2 as pb2


def get_streaming_response_tgis(response):
    stop = False
    generated_tokens = 0
    while not stop:
        try:
            x = next(response)
            timestamp = time.time_ns()
            data = json_format.MessageToDict(x)
            # skip first response (tokenizer output only)
            if "inputTokenCount" not in data:
                n_tokens = data["generatedTokenCount"] - generated_tokens
                generated_tokens = data["generatedTokenCount"]
                yield data, n_tokens, timestamp, True, None
        except Exception as e:
            timestamp = time.time_ns()
            yield None, 0, timestamp, False, e


channel = grpc.insecure_channel("localhost:8033")
stub = gpb2.GenerationServiceStub(channel)
max_new_tokens = 100

template = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{}\n\n### Response:"
num_req = 0
while True:
    prompt_input = input(f"\n{num_req}) Enter a prompt:\n")

    print("-" * 40)
    print("Output:")
    prompt = template.format(prompt_input)
    sample_request = {
        "model_id": "dummy-model-name",
        "request": {"text": prompt},
        "params": {
            "method": "GREEDY",
            "stopping": {
                "max_new_tokens": max_new_tokens,
                "min_new_tokens": max_new_tokens,
            },
        },
    }
    message = json_format.ParseDict(sample_request, pb2.SingleGenerationRequest())
    output = []
    total_time = 0
    response = stub.GenerateStream(message)
    response_generator = get_streaming_response_tgis(response)
    t0 = time.time_ns()
    response = ""
    stop = False
    while not stop:
        r, n_tokens, t, ok, err = next(response_generator)

        if not ok:
            stop = True
            # check if we have reached end of stream
            if type(err) is StopIteration:
                continue
        duration = (t - t0) / 1000.0 / 1000.0
        record = {
            "response": r,
            "ok": ok,
            "error": str(err),
            "timestamp": t,
            "duration_ms": duration,
            "n_tokens": n_tokens,
        }
        total_time += duration
        response += r["text"]
        output.append(record)
        t0 = t

    # print(json.dumps(output, indent=4))
    print("-" * 40)
    print(response)
    print("-" * 40)
    print(f"Total_time : {total_time}ms")
    print(f"Time_per_token : {total_time/max_new_tokens}ms")
    print("-" * 40)
    num_req += 1
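
The interactive loop above is meant for poking at a running server by hand. For scripted use, the same `GenerateStream` RPC can be wrapped in a small helper; the sketch below is not part of the commit, reuses the generated stubs and request shape from `sample_client.py`, and the helper name, defaults, and `localhost:8033` target are illustrative assumptions:

```python
# Sketch (not part of the commit): non-interactive wrapper around the same
# GenerateStream RPC exercised by sample_client.py above.
import grpc
from google.protobuf import json_format
from text_generation_tests.pb import generation_pb2_grpc as gpb2, generation_pb2 as pb2


def generate_greedy(prompt: str, max_new_tokens: int = 64, target: str = "localhost:8033") -> str:
    """Stream one greedy generation and return the concatenated output text."""
    channel = grpc.insecure_channel(target)
    stub = gpb2.GenerationServiceStub(channel)
    request = json_format.ParseDict(
        {
            "model_id": "dummy-model-name",  # same placeholder id as the sample above
            "request": {"text": prompt},
            "params": {
                "method": "GREEDY",
                "stopping": {"max_new_tokens": max_new_tokens},
            },
        },
        pb2.SingleGenerationRequest(),
    )
    chunks = []
    for message in stub.GenerateStream(request):
        data = json_format.MessageToDict(message)
        # the first streamed message only reports the input token count
        if "inputTokenCount" in data:
            continue
        chunks.append(data.get("text", ""))
    channel.close()
    return "".join(chunks)


if __name__ == "__main__":
    print(generate_greedy("Explain gRPC server streaming in one sentence."))
```
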
4 changes: 2 additions & 2 deletions launcher/Cargo.toml
@@ -6,10 +6,10 @@ authors = ["Olivier Dehaene"]
description = "Text Generation Launcher"

[dependencies]
clap = { version = "4.5.3", features = ["derive", "env"] }
clap = { version = "4.5.4", features = ["derive", "env"] }
ctrlc = { version = "3.4.4", features = ["termination"] }
nix = { version = "0.28.0", features = ["process", "signal"] }
serde_json = "^1.0.114"
serde_json = "^1.0.11"
tracing = "0.1.40"
tracing-subscriber = { version = "0.3.18", features = ["json", "env-filter"] }
uuid = { version = "1.8.0", features = ["v4", "fast-rng"] }
25 changes: 13 additions & 12 deletions router/Cargo.toml
@@ -17,33 +17,34 @@ path = "src/main.rs"
axum = { version = "0.6.20", features = ["json"] }
axum-tracing-opentelemetry = "0.10.0"
text-generation-client = { path = "client" }
clap = { version = "^4.5.2", features = ["derive", "env"] }
clap = { version = "^4.5.4", features = ["derive", "env"] }
futures = "^0.3.30"
flume = "^0.11.0"
metrics = "0.21.1"
metrics-exporter-prometheus = { version = "0.12.2", features = [] }
moka = { version = "0.12.5", features = ["future"] }
moka = { version = "0.12.6", features = ["future"] }
nohash-hasher = "^0.2.0"
num = "^0.4.1"
num = "^0.4.2"
num_cpus = "^1.16.0"
hyper = "^0.14.28" # Override to address CVE-2023-26964
h2 = "^0.3.26 " # Override to address CVEs
openssl = "^0.10.64" # Override to address WS-2023-0082, WS-2023-0083, WS-2023-0195
openssl-sys = "^0.9.101" # Override to address WS-2023-0082, WS-2023-0083, WS-2023-0195
openssl-sys = "^0.9.102" # Override to address WS-2023-0082, WS-2023-0083, WS-2023-0195
rustls-webpki = "0.102.2" # Override to address WS-2023-0305, CVE-2018-16875
rand = "^0.8.5"
serde = "^1.0.197"
serde_json = "^1.0.114"
serde = "^1.0.198"
serde_json = "^1.0.116"
thiserror = "^1.0.57"
tokenizers = "0.15.2"
tokio = { version = "1.36.0", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync", "fs"] }
tokio-rustls = "^0.25.0"
rustls = "0.22.2"
tokenizers = "0.19.1"
tokio = { version = "1.37.0", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync", "fs"] }
tokio-rustls = "^0.26.0"
rustls = "0.22.4"
tracing = "^0.1.40"
prost = "^0.12.3"
prost = "^0.12.4"
tonic = { version = "^0.11.0", features = ["tls"] }
tracing-subscriber = { version = "0.3.18", features = ["json", "env-filter"] }
tracing-opentelemetry = "0.23.0"
tokio-stream ="^0.1.14"
tokio-stream ="^0.1.15"
unicode-segmentation = "^1.11.0"
unicode-truncate = "^0.2.0"
opentelemetry = "0.22.0"
4 changes: 2 additions & 2 deletions router/client/Cargo.toml
@@ -6,9 +6,9 @@ build="build.rs"

[dependencies]
futures = "^0.3.30"
prost = "^0.12.3"
prost = "^0.12.4"
thiserror = "^1.0.58"
tokio = { version = "1.36.0", features = ["sync"] }
tokio = { version = "1.37.0", features = ["sync"] }
tonic = "^0.11.0"
tower = "^0.4.13"
tracing = "^0.1.40"
17 changes: 13 additions & 4 deletions router/src/batcher.rs
@@ -839,10 +839,19 @@ impl<'a> TokenProcessor<'a> {
            let request_id = output.request_id;
            let next_token_id = output.token_id;

            let e = self
                .entries
                .get_mut(&request_id)
                .expect("ID not found. This is a bug.");
            let e = self.entries.get_mut(&request_id);

            // if a client cancelled a request and speculative decoding is
            // enabled, it's possible that the request will get removed
            // from entries table, but there can still be tokens in outputs stream
            // corresponding to that request. ideally we could defer removing
            // the request_id from the entries table until all tokens have been
            // processed...but for now let's just ignore them.
            if e.is_none() {
                continue;
            }

            let e = e.unwrap();

            let is_stream = e.stream_tx.is_some();
            let stop_seqs = &e.request.parameters.stop_seqs;
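
The comment in this hunk explains the race being handled: with speculative decoding enabled, a cancelled request can already be evicted from the entries table while tokens for it are still sitting in the output stream, so the panicking `.expect(...)` lookup is replaced by one that simply skips such orphaned outputs. A standalone illustration of that lookup-or-skip pattern, in plain Python with hypothetical names rather than the router's actual Rust types:

```python
# Hypothetical illustration only: token outputs whose request is no longer in
# the entries table are ignored instead of being treated as a bug.
entries = {17: {"text": ""}}                       # request 42 was cancelled and evicted
token_outputs = [(17, "Hel"), (42, "ignored"), (17, "lo")]

for request_id, token in token_outputs:
    entry = entries.get(request_id)
    if entry is None:
        # speculative decoding can leave tokens in flight for an evicted request
        continue
    entry["text"] += token

assert entries[17]["text"] == "Hello"
```
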
2 changes: 1 addition & 1 deletion rust-toolchain.toml
@@ -1,3 +1,3 @@
[toolchain]
channel = "1.77"
channel = "1.77.2"
components = ["rustfmt", "clippy"]
2 changes: 1 addition & 1 deletion server/Makefile
@@ -4,7 +4,7 @@ all: install run-dev
.PHONY: gen-server
gen-server:
	# Compile protos
	pip install grpcio-tools==1.60.0 mypy-protobuf==3.5.0 'types-protobuf>=3.20.4' --no-cache-dir
	pip install grpcio-tools==1.62.2 mypy-protobuf==3.5.0 'types-protobuf>=3.20.4' --no-cache-dir
	mkdir -p text_generation_server/pb
	python -m grpc_tools.protoc -I../proto \
		--python_out=text_generation_server/pb \