[InferenceClient] Add support for adapter_id (text-generation) and response_format (chat-completion) #2383

Merged · 12 commits · Jul 16, 2024
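In short, the PR threads two new parameters through `InferenceClient`: `adapter_id` on `text_generation` (to target a LoRA adapter loaded in the serving Text-Generation-Inference instance) and `response_format` on `chat_completion` (grammar-constrained output). The docstring diff below includes a worked `response_format` example, but none for `adapter_id`, so here is a minimal sketch of the intended call, assuming a local TGI server with the adapter preloaded (the URL, launcher flag, and adapter name are all hypothetical):

```py
from huggingface_hub import InferenceClient

# Hypothetical TGI endpoint, e.g. launched with:
#   text-generation-launcher --model-id ... --lora-adapters my-org/my-lora-adapter
client = InferenceClient("http://localhost:3000")

# adapter_id selects which loaded LoRA adapter the server should apply;
# leaving it unset falls back to the base model weights.
output = client.text_generation(
    "def fibonacci(n):",
    adapter_id="my-org/my-lora-adapter",  # hypothetical adapter name
    max_new_tokens=60,
)
print(output)
```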
10 changes: 8 additions & 2 deletions docs/source/en/package_reference/inference_types.md
@@ -55,14 +55,20 @@ This part of the lib is still under development and will be improved in future releases.

[[autodoc]] huggingface_hub.ChatCompletionInputFunctionDefinition

[[autodoc]] huggingface_hub.ChatCompletionInputFunctionName

[[autodoc]] huggingface_hub.ChatCompletionInputGrammarType

[[autodoc]] huggingface_hub.ChatCompletionInputMessage

[[autodoc]] huggingface_hub.ChatCompletionInputTool
[[autodoc]] huggingface_hub.ChatCompletionInputMessageChunk

[[autodoc]] huggingface_hub.ChatCompletionInputToolCall
[[autodoc]] huggingface_hub.ChatCompletionInputTool

[[autodoc]] huggingface_hub.ChatCompletionInputToolTypeClass

[[autodoc]] huggingface_hub.ChatCompletionInputURL

[[autodoc]] huggingface_hub.ChatCompletionOutput

[[autodoc]] huggingface_hub.ChatCompletionOutputComplete
10 changes: 8 additions & 2 deletions docs/source/ko/package_reference/inference_types.md
@@ -54,14 +54,20 @@ rendered properly in your Markdown viewer.

[[autodoc]] huggingface_hub.ChatCompletionInputFunctionDefinition

[[autodoc]] huggingface_hub.ChatCompletionInputFunctionName

[[autodoc]] huggingface_hub.ChatCompletionInputGrammarType

[[autodoc]] huggingface_hub.ChatCompletionInputMessage

[[autodoc]] huggingface_hub.ChatCompletionInputTool
[[autodoc]] huggingface_hub.ChatCompletionInputMessageChunk

[[autodoc]] huggingface_hub.ChatCompletionInputToolCall
[[autodoc]] huggingface_hub.ChatCompletionInputTool

[[autodoc]] huggingface_hub.ChatCompletionInputToolTypeClass

[[autodoc]] huggingface_hub.ChatCompletionInputURL

[[autodoc]] huggingface_hub.ChatCompletionOutput

[[autodoc]] huggingface_hub.ChatCompletionOutputComplete
10 changes: 8 additions & 2 deletions src/huggingface_hub/__init__.py
@@ -280,10 +280,13 @@
"AutomaticSpeechRecognitionParameters",
"ChatCompletionInput",
"ChatCompletionInputFunctionDefinition",
"ChatCompletionInputFunctionName",
"ChatCompletionInputGrammarType",
"ChatCompletionInputMessage",
"ChatCompletionInputMessageChunk",
"ChatCompletionInputTool",
"ChatCompletionInputToolCall",
"ChatCompletionInputToolTypeClass",
"ChatCompletionInputURL",
"ChatCompletionOutput",
"ChatCompletionOutputComplete",
"ChatCompletionOutputFunctionDefinition",
@@ -772,10 +775,13 @@ def __dir__():
AutomaticSpeechRecognitionParameters, # noqa: F401
ChatCompletionInput, # noqa: F401
ChatCompletionInputFunctionDefinition, # noqa: F401
ChatCompletionInputFunctionName, # noqa: F401
ChatCompletionInputGrammarType, # noqa: F401
ChatCompletionInputMessage, # noqa: F401
ChatCompletionInputMessageChunk, # noqa: F401
ChatCompletionInputTool, # noqa: F401
ChatCompletionInputToolCall, # noqa: F401
ChatCompletionInputToolTypeClass, # noqa: F401
ChatCompletionInputURL, # noqa: F401
ChatCompletionOutput, # noqa: F401
ChatCompletionOutputComplete, # noqa: F401
ChatCompletionOutputFunctionDefinition, # noqa: F401
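Since the generated chat-completion types are now re-exported at the package root, the `response_format` payload can also be built from the dataclass rather than a raw dict. A sketch, under the assumption that `ChatCompletionInputGrammarType` exposes the same `type`/`value` fields as the documented dict payload (the model ID is illustrative):

```py
from huggingface_hub import ChatCompletionInputGrammarType, InferenceClient

# Assumed dataclass shape, mirroring the documented {"type": ..., "value": ...} dict:
# a grammar kind ("json" here) plus the JSON schema the output must satisfy.
response_format = ChatCompletionInputGrammarType(
    type="json",
    value={
        "properties": {"answer": {"type": "string"}},
        "required": ["answer"],
    },
)

client = InferenceClient("meta-llama/Meta-Llama-3-70B-Instruct")
response = client.chat_completion(
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    response_format=response_format,
    max_tokens=100,
)
print(response.choices[0].message.content)  # constrained to match the schema
```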
76 changes: 68 additions & 8 deletions src/huggingface_hub/inference/_client.py
@@ -78,6 +78,7 @@
AudioClassificationOutputElement,
AudioToAudioOutputElement,
AutomaticSpeechRecognitionOutput,
ChatCompletionInputGrammarType,
ChatCompletionInputTool,
ChatCompletionInputToolTypeClass,
ChatCompletionOutput,
@@ -103,7 +104,6 @@
ZeroShotClassificationOutputElement,
ZeroShotImageClassificationOutputElement,
)
from huggingface_hub.inference._generated.types.chat_completion import ChatCompletionInputToolTypeEnum
from huggingface_hub.inference._types import (
ConversationalOutput, # soon to be removed
)
@@ -433,10 +433,11 @@ def chat_completion( # type: ignore
max_tokens: Optional[int] = None,
n: Optional[int] = None,
presence_penalty: Optional[float] = None,
response_format: Optional[ChatCompletionInputGrammarType] = None,
seed: Optional[int] = None,
stop: Optional[List[str]] = None,
temperature: Optional[float] = None,
tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, ChatCompletionInputToolTypeEnum]] = None,
tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None,
tool_prompt: Optional[str] = None,
tools: Optional[List[ChatCompletionInputTool]] = None,
top_logprobs: Optional[int] = None,
@@ -457,10 +458,11 @@ def chat_completion( # type: ignore
max_tokens: Optional[int] = None,
n: Optional[int] = None,
presence_penalty: Optional[float] = None,
response_format: Optional[ChatCompletionInputGrammarType] = None,
seed: Optional[int] = None,
stop: Optional[List[str]] = None,
temperature: Optional[float] = None,
tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, ChatCompletionInputToolTypeEnum]] = None,
tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None,
tool_prompt: Optional[str] = None,
tools: Optional[List[ChatCompletionInputTool]] = None,
top_logprobs: Optional[int] = None,
@@ -481,10 +483,11 @@ def chat_completion(
max_tokens: Optional[int] = None,
n: Optional[int] = None,
presence_penalty: Optional[float] = None,
response_format: Optional[ChatCompletionInputGrammarType] = None,
seed: Optional[int] = None,
stop: Optional[List[str]] = None,
temperature: Optional[float] = None,
tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, ChatCompletionInputToolTypeEnum]] = None,
tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None,
tool_prompt: Optional[str] = None,
tools: Optional[List[ChatCompletionInputTool]] = None,
top_logprobs: Optional[int] = None,
@@ -505,10 +508,11 @@ def chat_completion(
max_tokens: Optional[int] = None,
n: Optional[int] = None,
presence_penalty: Optional[float] = None,
response_format: Optional[ChatCompletionInputGrammarType] = None,
seed: Optional[int] = None,
stop: Optional[List[str]] = None,
temperature: Optional[float] = None,
tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, ChatCompletionInputToolTypeEnum]] = None,
tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None,
tool_prompt: Optional[str] = None,
tools: Optional[List[ChatCompletionInputTool]] = None,
top_logprobs: Optional[int] = None,
@@ -544,6 +548,8 @@ def chat_completion(
presence_penalty (`float`, *optional*):
Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the
text so far, increasing the model's likelihood to talk about new topics.
response_format ([`ChatCompletionInputGrammarType`], *optional*):
Grammar constraints. Can be either a JSONSchema or a regex.
seed (Optional[`int`], *optional*):
Seed for reproducible control flow. Defaults to None.
stop (Optional[`str`], *optional*):
@@ -561,7 +567,7 @@
top_p (`float`, *optional*):
Fraction of the most likely next words to sample from.
Must be between 0 and 1. Defaults to 1.0.
tool_choice ([`ChatCompletionInputToolTypeClass`] or [`ChatCompletionInputToolTypeEnum`], *optional*):
tool_choice ([`ChatCompletionInputToolTypeClass`] or `str`, *optional*):
The tool to use for the completion. Defaults to "auto".
tool_prompt (`str`, *optional*):
A prompt to be appended before the tools.
@@ -588,7 +594,6 @@ def chat_completion(
Example:

```py
# Chat example
>>> from huggingface_hub import InferenceClient
>>> messages = [{"role": "user", "content": "What is the capital of France?"}]
>>> client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
@@ -618,15 +623,23 @@ def chat_completion(
total_tokens=25
)
)
```

Example (stream=True):
```py
>>> from huggingface_hub import InferenceClient
>>> messages = [{"role": "user", "content": "What is the capital of France?"}]
>>> client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
>>> for token in client.chat_completion(messages, max_tokens=10, stream=True):
... print(token)
ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content='The', role='assistant'), index=0, finish_reason=None)], created=1710498504)
ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content=' capital', role='assistant'), index=0, finish_reason=None)], created=1710498504)
(...)
ChatCompletionStreamOutput(choices=[ChatCompletionStreamOutputChoice(delta=ChatCompletionStreamOutputDelta(content=' may', role='assistant'), index=0, finish_reason=None)], created=1710498504)
```

# Chat example with tools
Example using tools:
```py
>>> client = InferenceClient("meta-llama/Meta-Llama-3-70B-Instruct")
>>> messages = [
... {
@@ -707,6 +720,37 @@ def chat_completion(
description=None
)
```

Example using response_format:
```py
>>> from huggingface_hub import InferenceClient
>>> client = InferenceClient("meta-llama/Meta-Llama-3-70B-Instruct")
>>> messages = [
... {
... "role": "user",
... "content": "I saw a puppy a cat and a raccoon during my bike ride in the park. What did I saw and when?",
... },
... ]
>>> response_format = {
... "type": "json",
... "value": {
... "properties": {
... "location": {"type": "string"},
... "activity": {"type": "string"},
... "animals_seen": {"type": "integer", "minimum": 1, "maximum": 5},
... "animals": {"type": "array", "items": {"type": "string"}},
... },
... "required": ["location", "activity", "animals_seen", "animals"],
... },
... }
>>> response = client.chat_completion(
... messages=messages,
... response_format=response_format,
... max_tokens=500,
)
>>> response.choices[0].message.content
'{\n\n"activity": "bike ride",\n"animals": ["puppy", "cat", "raccoon"],\n"animals_seen": 3,\n"location": "park"}'
```
"""
# determine model
model = model or self.model or self.get_recommended_model("text-generation")
@@ -739,6 +783,7 @@ def chat_completion(
max_tokens=max_tokens,
n=n,
presence_penalty=presence_penalty,
response_format=response_format,
seed=seed,
stop=stop,
temperature=temperature,
@@ -790,6 +835,11 @@ def chat_completion(
"Tools are not supported by the model. This is due to the model not been served by a "
"Text-Generation-Inference server. The provided tool parameters will be ignored."
)
if response_format is not None:
warnings.warn(
"Response format is not supported by the model. This is due to the model not been served by a "
"Text-Generation-Inference server. The provided response format will be ignored."
)

# generate response
text_generation_output = self.text_generation(
@@ -1677,6 +1727,7 @@ def text_generation( # type: ignore
stream: Literal[False] = ...,
model: Optional[str] = None,
# Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
adapter_id: Optional[str] = None,
best_of: Optional[int] = None,
decoder_input_details: Optional[bool] = None,
do_sample: Optional[bool] = False, # Manual default value
@@ -1705,6 +1756,7 @@ def text_generation( # type: ignore
stream: Literal[False] = ...,
model: Optional[str] = None,
# Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
adapter_id: Optional[str] = None,
best_of: Optional[int] = None,
decoder_input_details: Optional[bool] = None,
do_sample: Optional[bool] = False, # Manual default value
@@ -1733,6 +1785,7 @@ def text_generation( # type: ignore
stream: Literal[True] = ...,
model: Optional[str] = None,
# Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
adapter_id: Optional[str] = None,
best_of: Optional[int] = None,
decoder_input_details: Optional[bool] = None,
do_sample: Optional[bool] = False, # Manual default value
@@ -1761,6 +1814,7 @@ def text_generation( # type: ignore
stream: Literal[True] = ...,
model: Optional[str] = None,
# Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
adapter_id: Optional[str] = None,
best_of: Optional[int] = None,
decoder_input_details: Optional[bool] = None,
do_sample: Optional[bool] = False, # Manual default value
@@ -1789,6 +1843,7 @@ def text_generation(
stream: bool = ...,
model: Optional[str] = None,
# Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
adapter_id: Optional[str] = None,
best_of: Optional[int] = None,
decoder_input_details: Optional[bool] = None,
do_sample: Optional[bool] = False, # Manual default value
@@ -1816,6 +1871,7 @@ def text_generation(
stream: bool = False,
model: Optional[str] = None,
# Parameters from `TextGenerationInputGenerateParameters` (maintained manually)
adapter_id: Optional[str] = None,
best_of: Optional[int] = None,
decoder_input_details: Optional[bool] = None,
do_sample: Optional[bool] = False, # Manual default value
@@ -1867,6 +1923,8 @@ def text_generation(
model (`str`, *optional*):
The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
adapter_id (`str`, *optional*):
LoRA adapter ID.
best_of (`int`, *optional*):
Generate best_of sequences and return the one with the highest token logprobs.
decoder_input_details (`bool`, *optional*):
@@ -2035,6 +2093,7 @@ def text_generation(

# Build payload
parameters = {
"adapter_id": adapter_id,
"best_of": best_of,
"decoder_input_details": decoder_input_details,
"details": details,
@@ -2105,6 +2164,7 @@ def text_generation(
details=details,
stream=stream,
model=model,
adapter_id=adapter_id,
best_of=best_of,
decoder_input_details=decoder_input_details,
do_sample=do_sample,
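One consequence of dropping `ChatCompletionInputToolTypeEnum` above is that `tool_choice` is now typed as `Union[ChatCompletionInputToolTypeClass, str]`, so a bare string should be accepted. A sketch under that assumption (model ID and tool definition are illustrative, following the dict shape from the docstring example):

```py
from huggingface_hub import InferenceClient

client = InferenceClient("meta-llama/Meta-Llama-3-70B-Instruct")

# One illustrative tool, in the same dict shape as the docstring's tools example.
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {"location": {"type": "string"}},
                "required": ["location"],
            },
        },
    }
]

response = client.chat_completion(
    messages=[{"role": "user", "content": "What's the weather in Paris?"}],
    tools=tools,
    tool_choice="auto",  # plain string instead of the removed enum
    max_tokens=200,
)
print(response.choices[0].message.tool_calls)
```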
2 changes: 0 additions & 2 deletions src/huggingface_hub/inference/_common.py
@@ -315,7 +315,6 @@ def _format_chat_completion_stream_output_from_text_generation(
# explicitly set 'dummy' values to reduce expectations from users
id="dummy",
model="dummy",
object="dummy",
system_fingerprint="dummy",
choices=[
ChatCompletionStreamOutputChoice(
@@ -335,7 +334,6 @@ def _format_chat_completion_stream_output_from_text_generation(
# explicitly set 'dummy' values to reduce expectations from users
id="dummy",
model="dummy",
object="dummy",
system_fingerprint="dummy",
choices=[
ChatCompletionStreamOutputChoice(