Commit
* add ollama docs
* update doc
* update index
* add system prompt as task input
* add user prompt
* set gpu to 1 in ollama initialization
* remove modelfile
* bump version
* fix code

Signed-off-by: Samhita Alla <[email protected]>

1 parent 1b4d46d, commit a94aa97, showing 8 changed files with 163 additions and 1 deletion.

@@ -0,0 +1,23 @@
# ######################
# NOTE: For CI/CD only #
########################
FROM python:3.11-slim-buster
LABEL org.opencontainers.image.source=https://github.com/flyteorg/flytesnacks

WORKDIR /root
ENV VENV /opt/venv
ENV LANG C.UTF-8
ENV LC_ALL C.UTF-8
ENV PYTHONPATH /root

# Install Python dependencies
COPY requirements.in /root
RUN pip install -r /root/requirements.in

# Copy the actual code
COPY . /root/

# This tag is supplied by the build script and will be used to determine the version
# when registering tasks, workflows, and launch plans
ARG tag
ENV FLYTE_INTERNAL_IMAGE $tag
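
For illustration, here is a hypothetical build invocation showing how the `tag` build argument flows into `FLYTE_INTERNAL_IMAGE` (the actual tag value is chosen by the build script, not shown in this commit):

```shell
docker build --build-arg tag=ghcr.io/flyteorg/flytesnacks:a94aa97 -t ghcr.io/flyteorg/flytesnacks:a94aa97 .
```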

@@ -0,0 +1,36 @@
(ollama_plugin)=

# Ollama

```{eval-rst}
.. tags:: Inference, LLM
```

Serve large language models (LLMs) in a Flyte task.

[Ollama](https://ollama.com/) simplifies the process of serving fine-tuned LLMs.
Whether you're generating predictions from a customized model or deploying it across different hardware setups,
Ollama enables you to encapsulate the entire workflow in a single pipeline.

## Installation

To use the Ollama plugin, run the following command:

```
pip install flytekitplugins-inference
```

## Example usage

For a usage example, see {doc}`Ollama example usage <serve_llm>`.

```{note}
Ollama can only be run in a Flyte cluster as it must be deployed as a sidecar service in a Kubernetes pod.
```

```{toctree}
:maxdepth: -1
:hidden:

serve_llm
```
Empty file.

@@ -0,0 +1,99 @@
# %% [markdown]
# (serve_llm)=
#
# # Serve LLMs with Ollama
#
# In this guide, you'll learn how to locally serve Gemma2 and fine-tuned Llama3 models using Ollama within a Flyte task.
#
# Start by importing Ollama from the `flytekitplugins.inference` package and specifying the desired model name.
#
# Below is a straightforward example of serving a Gemma2 model:
# %%
from flytekit import ImageSpec, Resources, task
from flytekit.extras.accelerators import A10G
from flytekitplugins.inference import Model, Ollama
from openai import OpenAI

image = ImageSpec(
    name="ollama_serve",
    registry="ghcr.io/flyteorg",
    packages=["flytekitplugins-inference"],
    builder="default",
)

ollama_instance = Ollama(model=Model(name="gemma2"), gpu="1")


@task(
    container_image=image,
    pod_template=ollama_instance.pod_template,
    accelerator=A10G,
    requests=Resources(gpu="0"),
)
def model_serving(user_prompt: str) -> str:
    # An API key is required by the OpenAI client but is ignored by Ollama.
    client = OpenAI(base_url=f"{ollama_instance.base_url}/v1", api_key="ollama")

    completion = client.chat.completions.create(
        model="gemma2",
        messages=[
            {
                "role": "user",
                "content": user_prompt,
            }
        ],
        temperature=0.5,
        top_p=1,
        max_tokens=1024,
    )

    return completion.choices[0].message.content


# %% [markdown]
# :::{important}
# Replace `ghcr.io/flyteorg` with a container registry to which you can publish.
# To upload the image to the local registry in the demo cluster, set the registry to `localhost:30000`.
# :::
#
# The `model_serving` task initiates a sidecar service to serve the model, making it accessible on localhost via the `base_url` property.
# You can use either the chat or the chat completion endpoint.
#
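# As an illustration, here is a minimal sketch of calling Ollama's native chat
# endpoint directly instead of the OpenAI-compatible one (this assumes the standard
# Ollama REST API and that the `requests` package is installed in the image; both
# are assumptions, not part of this example):
#
# ```python
# import requests
#
# response = requests.post(
#     f"{ollama_instance.base_url}/api/chat",
#     json={
#         "model": "gemma2",
#         "messages": [{"role": "user", "content": "Why is the sky blue?"}],
#         "stream": False,
#     },
# )
# print(response.json()["message"]["content"])
# ```
#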
# By default, Ollama initializes the server with `cpu`, `gpu`, and `mem` set to `1`, `1`, and `15Gi`, respectively.
# You can adjust these settings to meet your requirements.
#
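# For example, here is a sketch of a more generously provisioned server, using the
# parameter names implied by the defaults above:
#
# ```python
# ollama_instance = Ollama(
#     model=Model(name="gemma2"),
#     cpu="2",
#     gpu="1",
#     mem="20Gi",
# )
# ```
#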
# To serve a fine-tuned model, provide the model configuration as `modelfile` within the `Model` dataclass.
#
# Below is an example of specifying a fine-tuned LoRA adapter for a Llama3 Mario model:
# %%
from flytekit.types.file import FlyteFile

finetuned_ollama_instance = Ollama(
    model=Model(
        name="llama3-mario",
        modelfile="FROM llama3\nADAPTER {inputs.ggml}\nPARAMETER temperature 1\nPARAMETER num_ctx 4096\nSYSTEM {inputs.system_prompt}",
    ),
    gpu="1",
)


@task(
    container_image=image,
    pod_template=finetuned_ollama_instance.pod_template,
    accelerator=A10G,
    requests=Resources(gpu="0"),
)
def finetuned_model_serving(ggml: FlyteFile, system_prompt: str):
    # The task body is elided here; you would query the fine-tuned model
    # just as in `model_serving` above.
    ...


# %% [markdown]
# `{inputs.ggml}` and `{inputs.system_prompt}` are materialized at run time, with `ggml` and `system_prompt` available as inputs to the task.
#
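# As an illustration, here is a hypothetical invocation that supplies those inputs
# (the adapter path and prompt are placeholders; adjust them to your setup):
#
# ```shell
# pyflyte run --remote serve_llm.py finetuned_model_serving \
#     --ggml ./llama3-mario-adapter.ggml \
#     --system_prompt "You are Mario from Super Mario Bros."
# ```
#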
# Ollama models can be integrated into different stages of your AI workflow, including data pre-processing,
# model inference, and post-processing. Flyte also allows serving multiple Ollama models simultaneously
# on various instances.
#
# This integration enables you to self-host and serve AI models on your own infrastructure,
# ensuring full control over costs and data security.
#
# For more detailed information on the models natively supported by Ollama, visit the [Ollama models library](https://ollama.com/library).

@@ -0,0 +1 @@
flytekitplugins-inference>=1.13.6b1