Merge pull request opendatahub-io#379 from kserve/master
[pull] master from kserve:master
openshift-merge-bot[bot] authored Jun 19, 2024
2 parents 3065fd4 + 7f0f5e0 commit 4c52716
Showing 13 changed files with 165 additions and 100 deletions.
2 changes: 1 addition & 1 deletion charts/kserve-resources/README.md
@@ -72,7 +72,7 @@ $ helm install kserve oci://ghcr.io/kserve/charts/kserve --version v0.13.0
 | kserve.servingruntime.lgbserver.tag | string | `"v0.13.0"` | |
 | kserve.servingruntime.mlserver.image | string | `"docker.io/seldonio/mlserver"` | |
 | kserve.servingruntime.mlserver.modelClassPlaceholder | string | `"{{.Labels.modelClass}}"` | |
-| kserve.servingruntime.mlserver.tag | string | `"1.3.2"` | |
+| kserve.servingruntime.mlserver.tag | string | `"1.5.0"` | |
 | kserve.servingruntime.modelNamePlaceholder | string | `"{{.Name}}"` | |
 | kserve.servingruntime.paddleserver.image | string | `"kserve/paddleserver"` | |
 | kserve.servingruntime.paddleserver.tag | string | `"v0.13.0"` | |
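The bumped value is an ordinary chart setting, so it can also be pinned explicitly at install time; a minimal sketch reusing the install command from the hunk header above:

    # Sketch: install the chart with the MLServer runtime tag set explicitly.
    helm install kserve oci://ghcr.io/kserve/charts/kserve \
      --version v0.13.0 \
      --set kserve.servingruntime.mlserver.tag=1.5.0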
12 changes: 12 additions & 0 deletions charts/kserve-resources/templates/clusterservingruntimes.yaml
@@ -53,14 +53,26 @@ spec:
       version: "1"
       autoSelect: true
       priority: 2
+    - name: xgboost
+      version: "2"
+      autoSelect: true
+      priority: 2
     - name: lightgbm
       version: "3"
       autoSelect: true
       priority: 2
+    - name: lightgbm
+      version: "4"
+      autoSelect: true
+      priority: 2
     - name: mlflow
       version: "1"
       autoSelect: true
       priority: 1
+    - name: mlflow
+      version: "2"
+      autoSelect: true
+      priority: 1
   protocolVersions:
     - v2
   containers:
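These supportedModelFormats entries are what drives automatic runtime selection; a hedged sketch of an InferenceService that the new xgboost entry would now match (the name and storageUri are hypothetical):

    # Sketch: with autoSelect: true on xgboost version "2" above, KServe can
    # resolve this predictor to the MLServer runtime automatically.
    apiVersion: serving.kserve.io/v1beta1
    kind: InferenceService
    metadata:
      name: xgboost-v2-demo                       # hypothetical name
    spec:
      predictor:
        model:
          modelFormat:
            name: xgboost
            version: "2"
          storageUri: gs://example-bucket/xgboost # hypothetical location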
2 changes: 1 addition & 1 deletion charts/kserve-resources/values.yaml
@@ -95,7 +95,7 @@ kserve:
       tag: 2.6.2
     mlserver:
       image: docker.io/seldonio/mlserver
-      tag: 1.3.2
+      tag: 1.5.0
       modelClassPlaceholder: "{{.Labels.modelClass}}"
     sklearnserver:
       image: kserve/sklearnserver
12 changes: 12 additions & 0 deletions config/runtimes/kserve-mlserver.yaml
@@ -20,14 +20,26 @@ spec:
       version: "1"
       autoSelect: true
       priority: 2
+    - name: xgboost
+      version: "2"
+      autoSelect: true
+      priority: 2
     - name: lightgbm
       version: "3"
       autoSelect: true
       priority: 2
+    - name: lightgbm
+      version: "4"
+      autoSelect: true
+      priority: 2
     - name: mlflow
       version: "1"
       autoSelect: true
       priority: 1
+    - name: mlflow
+      version: "2"
+      autoSelect: true
+      priority: 1
   protocolVersions:
     - v2
   containers:
2 changes: 1 addition & 1 deletion config/runtimes/kustomization.yaml
@@ -25,7 +25,7 @@ images:

 - name: mlserver
   newName: docker.io/seldonio/mlserver
-  newTag: 1.3.2
+  newTag: 1.5.0

 - name: kserve-xgbserver
   newName: kserve/xgbserver
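The images transformer rewrites every matching image reference when the manifests are rendered; a small sketch using standard kubectl (path per the file header above):

    # Sketch: render config/runtimes; containers whose image name is
    # "mlserver" come out as docker.io/seldonio/mlserver:1.5.0.
    kubectl kustomize config/runtimes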
2 changes: 1 addition & 1 deletion pkg/controller/v1beta1/inferenceservice/controller.go
@@ -168,7 +168,7 @@ func (r *InferenceServiceReconciler) Reconcile(ctx context.Context, req ctrl.Req
 	// Abort early if the resolved deployment mode is Serverless, but Knative Services are not available
 	if deploymentMode == constants.Serverless {
 		ksvcAvailable, checkKsvcErr := utils.IsCrdAvailable(r.ClientConfig, knservingv1.SchemeGroupVersion.String(), constants.KnativeServiceKind)
-		if err != nil {
+		if checkKsvcErr != nil {
 			return reconcile.Result{}, checkKsvcErr
 		}

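This one-line change fixes a wrong-variable check: the guard tested err, a stale error from earlier in Reconcile, instead of checkKsvcErr, so a failed Knative CRD availability lookup could slip through unhandled. A self-contained sketch of the pattern (illustrative names, not KServe code):

    // Sketch: the wrong-variable error check. lookup() fails, but the
    // guard tests err (still nil), so the failure goes unnoticed.
    package main

    import (
        "errors"
        "fmt"
    )

    func lookup() (bool, error) {
        return false, errors.New("CRD discovery failed")
    }

    func main() {
        var err error // from some earlier call; still nil

        available, checkErr := lookup()
        if err != nil { // BUG: should test checkErr
            fmt.Println("aborting:", checkErr)
            return
        }
        fmt.Println("available:", available) // reached despite the error
    }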
8 changes: 5 additions & 3 deletions python/huggingface_server.Dockerfile
@@ -1,4 +1,4 @@
-ARG BASE_IMAGE=nvidia/cuda:12.1.0-devel-ubuntu22.04
+ARG BASE_IMAGE=nvidia/cuda:12.4.1-devel-ubuntu22.04
 ARG VENV_PATH=/prod_venv

 FROM ${BASE_IMAGE} as builder
@@ -9,7 +9,7 @@ ARG POETRY_HOME=/opt/poetry
 ARG POETRY_VERSION=1.7.1

 # Install vllm
-ARG VLLM_VERSION=0.4.2
+ARG VLLM_VERSION=0.4.3

 RUN apt-get update -y && apt-get install gcc python3.10-venv python3-dev -y && apt-get clean && \
     rm -rf /var/lib/apt/lists/*
@@ -34,7 +34,7 @@ RUN cd huggingfaceserver && poetry install --no-interaction --no-cache

 RUN pip3 install vllm==${VLLM_VERSION}

-FROM nvidia/cuda:12.1.0-base-ubuntu22.04 as prod
+FROM nvidia/cuda:12.4.1-runtime-ubuntu22.04 as prod

 RUN apt-get update -y && apt-get install python3.10-venv -y && apt-get clean && \
     rm -rf /var/lib/apt/lists/*
@@ -58,6 +58,8 @@ ENV HF_HOME="/tmp/huggingface"
 ENV SAFETENSORS_FAST_GPU="1"
 # https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hfhubdisabletelemetry
 ENV HF_HUB_DISABLE_TELEMETRY="1"
+# NCCL Lib path for vLLM. https://github.com/vllm-project/vllm/blob/ec784b2526219cd96159a52074ab8cd4e684410a/vllm/utils.py#L598-L602
+ENV VLLM_NCCL_SO_PATH="/lib/x86_64-linux-gnu/libnccl.so.2"

 USER 1000
 ENTRYPOINT ["python3", "-m", "huggingfaceserver"]
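For reference, a hedged sketch of building the image from this Dockerfile (the tag and build context are assumptions; the file lives under python/ per the header above):

    # Sketch: build from the repository's python/ directory.
    docker build -t huggingfaceserver:local \
      -f huggingface_server.Dockerfile python/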
@@ -144,10 +144,9 @@ async def create_completion(self, completion_request: CompletionRequest):

             generators.append(
                 self.engine.generate(
-                    prompt,
+                    {"prompt": prompt, "prompt_token_ids": input_ids},
                     sampling_params,
                     f"{request_id}-{i}",
-                    prompt_token_ids=input_ids,
                 )
             )
         except Exception as e:
@@ -175,7 +174,7 @@ async def create_completion(self, completion_request: CompletionRequest):
             )

         # Non-streaming response
-        final_res_batch: RequestOutput = [None] * len(prompts)
+        final_res_batch: List[RequestOutput] = [None] * len(prompts)
         try:
             async for i, res in result_generator:
                 final_res_batch[i] = res
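This mirrors the vLLM 0.4.3 API change in which generate takes the prompt text and its token ids together as a single inputs dict, replacing the old positional prompt plus a prompt_token_ids keyword. A minimal sketch of the new call shape (the wrapper function and parameter names are illustrative):

    from typing import List

    from vllm import SamplingParams
    from vllm.engine.async_llm_engine import AsyncLLMEngine


    async def submit(
        engine: AsyncLLMEngine, prompt: str, input_ids: List[int], request_id: str
    ):
        # vLLM >= 0.4.3: prompt and its token ids travel in one inputs dict.
        params = SamplingParams(max_tokens=64)
        return engine.generate(
            {"prompt": prompt, "prompt_token_ids": input_ids},
            params,
            request_id,
        )

The List[RequestOutput] annotation above is the matching typing fix: the batch is a list of per-prompt outputs, not a single RequestOutput (assuming the module imports List from typing).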
47 changes: 27 additions & 20 deletions python/huggingfaceserver/poetry.lock

Generated lock file; diff not rendered by default.

2 changes: 1 addition & 1 deletion python/huggingfaceserver/pyproject.toml
@@ -15,7 +15,7 @@ kserve = { path = "../kserve", extras = ["storage"], develop = true }
 transformers = "~4.40.2"
 accelerate = "~0.30.0"
 torch = "~2.3.0"
-vllm = { version = "^0.4.2", optional = true }
+vllm = { version = "^0.4.3", optional = true }

 [tool.poetry.extras]
 vllm = [
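Since vllm stays an optional extra, a quick sketch of pulling it in during development (assumes it is run from python/huggingfaceserver):

    # Sketch: install huggingfaceserver with the optional vLLM extra;
    # poetry resolves vllm to ^0.4.3 per the constraint above.
    poetry install --extras vllm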
44 changes: 28 additions & 16 deletions python/kserve/kserve/protocol/rest/openai/openai_proxy_model.py
@@ -243,8 +243,8 @@ async def create_completion(
         self, request: CompletionRequest
     ) -> Union[Completion, AsyncIterator[Completion]]:
         self.preprocess_completion_request(request)
-        req = self._build_request(self._completions_endpoint, request)
         if request.params.stream:
+            req = self._build_request(self._completions_endpoint, request)
             r = await self._http_client.send(req, stream=True)
             r.raise_for_status()
             it = AsyncMappingIterator(
@@ -254,23 +254,28 @@
             )
             return it
         else:
-            response = await self._http_client.send(req)
-            response.raise_for_status()
-            if self.skip_upstream_validation:
-                obj = response.json()
-                completion = Completion.model_construct(**obj)
-            else:
-                completion = Completion.model_validate_json(response.content)
+            completion = await self.generate_completion(request)
             self.postprocess_completion(completion, request)
             return completion

+    async def generate_completion(self, request: CompletionRequest) -> Completion:
+        req = self._build_request(self._completions_endpoint, request)
+        response = await self._http_client.send(req)
+        response.raise_for_status()
+        if self.skip_upstream_validation:
+            obj = response.json()
+            completion = Completion.model_construct(**obj)
+        else:
+            completion = Completion.model_validate_json(response.content)
+        return completion
+
     @error_handler
     async def create_chat_completion(
         self, request: ChatCompletionRequest
     ) -> Union[ChatCompletion, AsyncIterator[ChatCompletionChunk]]:
         self.preprocess_chat_completion_request(request)
-        req = self._build_request(self._chat_completions_endpoint, request)
         if request.params.stream:
+            req = self._build_request(self._chat_completions_endpoint, request)
             r = await self._http_client.send(req, stream=True)
             r.raise_for_status()
             it = AsyncMappingIterator(
@@ -280,12 +285,19 @@
             )
             return it
         else:
-            response = await self._http_client.send(req)
-            response.raise_for_status()
-            if self.skip_upstream_validation:
-                obj = response.json()
-                chat_completion = ChatCompletion.model_construct(**obj)
-            else:
-                chat_completion = ChatCompletion.model_validate_json(response.content)
+            chat_completion = await self.generate_chat_completion(request)
             self.postprocess_chat_completion(chat_completion, request)
             return chat_completion
+
+    async def generate_chat_completion(
+        self, request: ChatCompletionRequest
+    ) -> ChatCompletion:
+        req = self._build_request(self._chat_completions_endpoint, request)
+        response = await self._http_client.send(req)
+        response.raise_for_status()
+        if self.skip_upstream_validation:
+            obj = response.json()
+            chat_completion = ChatCompletion.model_construct(**obj)
+        else:
+            chat_completion = ChatCompletion.model_validate_json(response.content)
+        return chat_completion
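Beyond deduplicating the non-streaming path (the request is now built only where it is sent), the extraction turns the upstream call into an overridable seam via generate_completion and generate_chat_completion. A hypothetical sketch, not part of this PR; the class name and import path are assumed from the module name:

    from kserve.protocol.rest.openai.openai_proxy_model import OpenAIProxyModel


    class RetryingProxyModel(OpenAIProxyModel):
        async def generate_completion(self, request):
            # Keep the proxy's request building and validation, but retry
            # the non-streaming upstream call a few times.
            last_exc = None
            for _ in range(3):
                try:
                    return await super().generate_completion(request)
                except Exception as exc:  # real code would narrow this
                    last_exc = exc
            raise last_exc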