diff --git a/python/mypy.ini b/python/mypy.ini
index 9505beba81df..30d9947c2100 100644
--- a/python/mypy.ini
+++ b/python/mypy.ini
@@ -21,10 +21,6 @@ ignore_errors = true
 ignore_errors = true
 # TODO (eavanvalkenburg): remove this: https://github.com/microsoft/semantic-kernel/issues/7132
 
-[mypy-semantic_kernel.connectors.ai.hugging_face.*]
-ignore_errors = true
-# TODO (eavanvalkenburg): remove this: https://github.com/microsoft/semantic-kernel/issues/7133
-
 [mypy-semantic_kernel.connectors.ai.ollama.*]
 ignore_errors = true
 # TODO (eavanvalkenburg): remove this: https://github.com/microsoft/semantic-kernel/issues/7134
diff --git a/python/semantic_kernel/connectors/ai/hugging_face/services/hf_text_completion.py b/python/semantic_kernel/connectors/ai/hugging_face/services/hf_text_completion.py
index 05465ef607a6..61dd1554ec9d 100644
--- a/python/semantic_kernel/connectors/ai/hugging_face/services/hf_text_completion.py
+++ b/python/semantic_kernel/connectors/ai/hugging_face/services/hf_text_completion.py
@@ -1,22 +1,26 @@
 # Copyright (c) Microsoft. All rights reserved.
 
 import logging
+import sys
 from collections.abc import AsyncGenerator
 from threading import Thread
-from typing import TYPE_CHECKING, Any, Literal
+from typing import Any, Literal
+
+if sys.version_info >= (3, 12):
+    from typing import override  # pragma: no cover
+else:
+    from typing_extensions import override  # pragma: no cover
 
 import torch
 from transformers import AutoTokenizer, TextIteratorStreamer, pipeline
 
 from semantic_kernel.connectors.ai.hugging_face.hf_prompt_execution_settings import HuggingFacePromptExecutionSettings
+from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings
 from semantic_kernel.connectors.ai.text_completion_client_base import TextCompletionClientBase
 from semantic_kernel.contents.streaming_text_content import StreamingTextContent
 from semantic_kernel.contents.text_content import TextContent
 from semantic_kernel.exceptions import ServiceInvalidExecutionSettingsError, ServiceResponseException
 
-if TYPE_CHECKING:
-    from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings
-
 logger: logging.Logger = logging.getLogger(__name__)
 
 
@@ -29,7 +33,7 @@ def __init__(
         self,
         ai_model_id: str,
         task: str | None = "text2text-generation",
-        device: int | None = -1,
+        device: int = -1,
         service_id: str | None = None,
         model_kwargs: dict[str, Any] | None = None,
         pipeline_kwargs: dict[str, Any] | None = None,
@@ -39,22 +43,21 @@ def __init__(
         Args:
             ai_model_id (str): Hugging Face model card string, see
                 https://huggingface.co/models
-            device (Optional[int]): Device to run the model on, defaults to CPU, 0+ for GPU,
-                -- None if using device_map instead. (If both device and device_map
-                are specified, device overrides device_map. If unintended,
-                it can lead to unexpected behavior.)
-            service_id (Optional[str]): Service ID for the AI service.
-            task (Optional[str]): Model completion task type, options are:
+            device (int): Device to run the model on, defaults to CPU, 0+ for GPU,
+                -- None if using device_map instead. (If both device and device_map
+                are specified, device overrides device_map. If unintended,
+                it can lead to unexpected behavior.) (optional)
+            service_id (str): Service ID for the AI service. (optional)
+            task (str): Model completion task type, options are:
                 - summarization: takes a long text and returns a shorter summary.
                 - text-generation: takes incomplete text and returns a set of completion candidates.
                 - text2text-generation (default): takes an input prompt and returns a completion.
-                text2text-generation is the default as it behaves more like GPT-3+.
-            log : Logger instance. (Deprecated)
-            model_kwargs (Optional[Dict[str, Any]]): Additional dictionary of keyword arguments
-                passed along to the model's `from_pretrained(..., **model_kwargs)` function.
-            pipeline_kwargs (Optional[Dict[str, Any]]): Additional keyword arguments passed along
+                text2text-generation is the default as it behaves more like GPT-3+. (optional)
+            model_kwargs (dict[str, Any]): Additional dictionary of keyword arguments
+                passed along to the model's `from_pretrained(..., **model_kwargs)` function. (optional)
+            pipeline_kwargs (dict[str, Any]): Additional keyword arguments passed along
                 to the specific pipeline init (see the documentation for the corresponding pipeline class
-                for possible values).
+                for possible values). (optional)
 
         Note that this model will be downloaded from the Hugging Face model hub.
         """
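> Note: the version-gated import added above follows PEP 698, where `typing.override` landed in Python 3.12 and `typing_extensions` backports it to older interpreters. A minimal sketch of the pattern in isolation (the class names here are invented for illustration):

```python
import sys

# Prefer the stdlib decorator on 3.12+, fall back to the backport otherwise.
if sys.version_info >= (3, 12):
    from typing import override
else:
    from typing_extensions import override


class Base:
    def get_prompt_execution_settings_class(self) -> type:
        return object


class Sub(Base):
    @override  # a type checker errors here if Base lacks this method
    def get_prompt_execution_settings_class(self) -> type:
        return dict
```

> Runtime behavior is unchanged; the decorator only marks the function (via `__override__`) for type checkers.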
@@ -65,18 +68,19 @@ def __init__(
             model_kwargs=model_kwargs,
             **pipeline_kwargs or {},
         )
+        resolved_device = f"cuda:{device}" if device >= 0 and torch.cuda.is_available() else "cpu"
         super().__init__(
             service_id=service_id,
             ai_model_id=ai_model_id,
             task=task,
-            device=(f"cuda:{device}" if device >= 0 and torch.cuda.is_available() else "cpu"),
+            device=resolved_device,
             generator=generator,
         )
 
     async def get_text_contents(
         self,
         prompt: str,
-        settings: HuggingFacePromptExecutionSettings,
+        settings: PromptExecutionSettings,
     ) -> list[TextContent]:
         """This is the method that is called from the kernel to get a response from a text-optimized LLM.
 
@@ -87,10 +91,14 @@ async def get_text_contents(
         Returns:
             List[TextContent]: A list of TextContent objects representing the response(s) from the LLM.
         """
+        if not isinstance(settings, HuggingFacePromptExecutionSettings):
+            settings = self.get_prompt_execution_settings_from_settings(settings)
+        assert isinstance(settings, HuggingFacePromptExecutionSettings)  # nosec
+
         try:
             results = self.generator(prompt, **settings.prepare_settings_dict())
         except Exception as e:
-            raise ServiceResponseException("Hugging Face completion failed", e) from e
+            raise ServiceResponseException("Hugging Face completion failed") from e
         if isinstance(results, list):
             return [self._create_text_content(results, result) for result in results]
         return [self._create_text_content(results, results)]
@@ -105,7 +113,7 @@ def _create_text_content(self, response: Any, candidate: dict[str, str]) -> Text
     async def get_streaming_text_contents(
         self,
         prompt: str,
-        settings: HuggingFacePromptExecutionSettings,
+        settings: PromptExecutionSettings,
     ) -> AsyncGenerator[list[StreamingTextContent], Any]:
         """Streams a text completion using a Hugging Face model.
 
@@ -118,6 +126,10 @@ async def get_streaming_text_contents(
         Yields:
             List[StreamingTextContent]: List of StreamingTextContent objects.
         """
+        if not isinstance(settings, HuggingFacePromptExecutionSettings):
+            settings = self.get_prompt_execution_settings_from_settings(settings)
+        assert isinstance(settings, HuggingFacePromptExecutionSettings)  # nosec
+
         if settings.num_return_sequences > 1:
             raise ServiceInvalidExecutionSettingsError(
                 "HuggingFace TextIteratorStreamer does not stream multiple responses in a parseable format. \
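> Note: the hunks above widen the public signatures to the base `PromptExecutionSettings` (so they match the base-class contract under mypy) and narrow back at runtime before use. A self-contained sketch of that widen-then-narrow shape, using stand-in classes rather than the real semantic_kernel types:

```python
from typing import Any


class PromptExecutionSettings:
    """Stand-in for the generic base settings class."""


class HuggingFacePromptExecutionSettings(PromptExecutionSettings):
    num_return_sequences: int = 1

    def prepare_settings_dict(self) -> dict[str, Any]:
        return {"num_return_sequences": self.num_return_sequences}


def get_text_contents(settings: PromptExecutionSettings) -> dict[str, Any]:
    # Accept the base type to satisfy the overridden signature, then narrow.
    # The real service converts via get_prompt_execution_settings_from_settings();
    # the assert lets mypy trust the narrowed type from here on.
    if not isinstance(settings, HuggingFacePromptExecutionSettings):
        settings = HuggingFacePromptExecutionSettings()
    assert isinstance(settings, HuggingFacePromptExecutionSettings)  # nosec
    return settings.prepare_settings_dict()


print(get_text_contents(PromptExecutionSettings()))  # {'num_return_sequences': 1}
```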
@@ -139,10 +151,10 @@ async def get_streaming_text_contents(
             ]
             thread.join()
-
         except Exception as e:
-            raise ServiceResponseException("Hugging Face completion failed", e) from e
+            raise ServiceResponseException("Hugging Face completion failed") from e
 
-    def get_prompt_execution_settings_class(self) -> "PromptExecutionSettings":
+    @override
+    def get_prompt_execution_settings_class(self) -> type["PromptExecutionSettings"]:
         """Create a request settings object."""
         return HuggingFacePromptExecutionSettings
diff --git a/python/semantic_kernel/connectors/ai/hugging_face/services/hf_text_embedding.py b/python/semantic_kernel/connectors/ai/hugging_face/services/hf_text_embedding.py
index fd54c14d7e4f..057ec5be46dd 100644
--- a/python/semantic_kernel/connectors/ai/hugging_face/services/hf_text_embedding.py
+++ b/python/semantic_kernel/connectors/ai/hugging_face/services/hf_text_embedding.py
@@ -5,9 +5,9 @@
 from typing import Any
 
 if sys.version_info >= (3, 12):
-    from typing import override
+    from typing import override  # pragma: no cover
 else:
-    from typing_extensions import override
+    from typing_extensions import override  # pragma: no cover
 
 import sentence_transformers
 import torch
@@ -28,7 +28,7 @@ class HuggingFaceTextEmbedding(EmbeddingGeneratorBase):
     def __init__(
         self,
         ai_model_id: str,
-        device: int | None = -1,
+        device: int = -1,
         service_id: str | None = None,
     ) -> None:
         """Initializes a new instance of the HuggingFaceTextEmbedding class.
@@ -36,8 +36,8 @@ def __init__(
         Args:
             ai_model_id (str): Hugging Face model card string, see
                 https://huggingface.co/sentence-transformers
-            device (Optional[int]): Device to run the model on, -1 for CPU, 0+ for GPU.
-            service_id (Optional[str]): Service ID for the model.
+            device (int): Device to run the model on, -1 for CPU, 0+ for GPU. (optional)
+            service_id (str): Service ID for the model. (optional)
 
         Note that this model will be downloaded from the Hugging Face model hub.
         """
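> Note on the `raise` change repeated in both completion paths: `from e` already records the original error on `__cause__`, so also passing `e` as a second constructor argument only duplicated it inside `args`. A quick self-contained demonstration:

```python
class ServiceResponseException(Exception):
    """Stand-in for the semantic_kernel exception type."""


try:
    try:
        raise ValueError("boom")  # simulates a failing pipeline call
    except Exception as e:
        raise ServiceResponseException("Hugging Face completion failed") from e
except ServiceResponseException as exc:
    assert isinstance(exc.__cause__, ValueError)  # original error is preserved
    assert exc.args == ("Hugging Face completion failed",)  # message stays clean
```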
diff --git a/python/tests/unit/connectors/hugging_face/test_hf_text_completions.py b/python/tests/unit/connectors/hugging_face/test_hf_text_completions.py
index 4dd4959d0755..96099d8cf5b8 100644
--- a/python/tests/unit/connectors/hugging_face/test_hf_text_completions.py
+++ b/python/tests/unit/connectors/hugging_face/test_hf_text_completions.py
@@ -1,11 +1,14 @@
 # Copyright (c) Microsoft. All rights reserved.
 
-from unittest.mock import Mock, patch
+from threading import Thread
+from unittest.mock import MagicMock, Mock, patch
 
 import pytest
+from transformers import TextIteratorStreamer
 
 from semantic_kernel.connectors.ai.hugging_face.services.hf_text_completion import HuggingFaceTextCompletion
 from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings
+from semantic_kernel.exceptions import KernelInvokeException, ServiceResponseException
 from semantic_kernel.functions.kernel_arguments import KernelArguments
 from semantic_kernel.kernel import Kernel
 from semantic_kernel.prompt_template.prompt_template_config import PromptTemplateConfig
@@ -46,8 +49,9 @@ async def test_text_completion(model_name, task, input_str):
     # Configure LLM service
     with patch("semantic_kernel.connectors.ai.hugging_face.services.hf_text_completion.pipeline") as patched_pipeline:
         patched_pipeline.return_value = mock_pipeline
+        service = HuggingFaceTextCompletion(service_id=model_name, ai_model_id=model_name, task=task)
         kernel.add_service(
-            service=HuggingFaceTextCompletion(service_id=model_name, ai_model_id=model_name, task=task),
+            service=service,
         )
 
         exec_settings = PromptExecutionSettings(service_id=model_name, extension_data={"max_new_tokens": 25})
@@ -68,3 +72,148 @@ async def test_text_completion(model_name, task, input_str):
         await kernel.invoke(function_name="TestFunction", plugin_name="TestPlugin", arguments=arguments)
 
     assert mock_pipeline.call_args.args[0] == input_str
+
+
+@pytest.mark.asyncio
+async def test_text_completion_throws():
+    kernel = Kernel()
+
+    model_name = "patrickvonplaten/t5-tiny-random"
+    task = "text2text-generation"
+    input_str = "translate English to Dutch: Hello, how are you?"
+
+    with patch("semantic_kernel.connectors.ai.hugging_face.services.hf_text_completion.pipeline") as patched_pipeline:
+        mock_generator = Mock()
+        mock_generator.side_effect = Exception("Test exception")
+        patched_pipeline.return_value = mock_generator
+        service = HuggingFaceTextCompletion(service_id=model_name, ai_model_id=model_name, task=task)
+        kernel.add_service(service=service)
+
+        exec_settings = PromptExecutionSettings(service_id=model_name, extension_data={"max_new_tokens": 25})
+
+        prompt = "{{$input}}"
+        prompt_template_config = PromptTemplateConfig(template=prompt, execution_settings=exec_settings)
+
+        kernel.add_function(
+            prompt_template_config=prompt_template_config,
+            function_name="TestFunction",
+            plugin_name="TestPlugin",
+            prompt_execution_settings=exec_settings,
+        )
+
+        arguments = KernelArguments(input=input_str)
+
+        with pytest.raises(
+            KernelInvokeException, match="Error occurred while invoking function: 'TestPlugin-TestFunction'"
+        ):
+            await kernel.invoke(function_name="TestFunction", plugin_name="TestPlugin", arguments=arguments)
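> Note: `test_text_completion_throws` drives the failure path purely through mocks; the essential mechanism is `side_effect`, which makes the patched pipeline raise when called. Reduced to its core, with no semantic_kernel imports:

```python
from unittest.mock import Mock

# A Mock configured with side_effect raises instead of returning a value.
mock_generator = Mock(side_effect=Exception("Test exception"))

try:
    mock_generator("some prompt", max_new_tokens=25)
except Exception as e:
    assert str(e) == "Test exception"
```

> The service wraps that error in `ServiceResponseException`, which `kernel.invoke` surfaces as the `KernelInvokeException` matched above.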
"semantic_kernel.connectors.ai.hugging_face.services.hf_text_completion.pipeline", + return_value=mock_pipeline, + ), + patch( + "semantic_kernel.connectors.ai.hugging_face.services.hf_text_completion.Thread", + side_effect=Mock(spec=Thread), + ), + patch( + "semantic_kernel.connectors.ai.hugging_face.services.hf_text_completion.TextIteratorStreamer", + return_value=mock_streamer, + ) as mock_stream, + ): + mock_stream.return_value = mock_streamer + service = HuggingFaceTextCompletion(service_id=model_name, ai_model_id=model_name, task=task) + prompt = "test prompt" + exec_settings = PromptExecutionSettings(service_id=model_name, extension_data={"max_new_tokens": 25}) + + result = [] + async for content in service.get_streaming_text_contents(prompt, exec_settings): + result.append(content) + + assert len(result) == 1 + assert result[0][0].inner_content == "mocked_text" + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + ("model_name", "task", "input_str"), + [ + ( + "patrickvonplaten/t5-tiny-random", + "text2text-generation", + "translate English to Dutch: Hello, how are you?", + ), + ("HuggingFaceM4/tiny-random-LlamaForCausalLM", "text-generation", "Hello, I like sleeping and "), + ], + ids=["text2text-generation", "text-generation"], +) +async def test_text_completion_streaming_throws(model_name, task, input_str): + ret = {"summary_text": "test"} if task == "summarization" else {"generated_text": "test"} + mock_pipeline = Mock(return_value=ret) + + mock_streamer = MagicMock(spec=TextIteratorStreamer) + mock_streamer.__iter__.return_value = Exception() + + with ( + patch( + "semantic_kernel.connectors.ai.hugging_face.services.hf_text_completion.pipeline", + return_value=mock_pipeline, + ), + patch( + "semantic_kernel.connectors.ai.hugging_face.services.hf_text_completion.Thread", + side_effect=Exception(), + ), + patch( + "semantic_kernel.connectors.ai.hugging_face.services.hf_text_completion.TextIteratorStreamer", + return_value=mock_streamer, + ) as mock_stream, + ): + mock_stream.return_value = mock_streamer + service = HuggingFaceTextCompletion(service_id=model_name, ai_model_id=model_name, task=task) + prompt = "test prompt" + exec_settings = PromptExecutionSettings(service_id=model_name, extension_data={"max_new_tokens": 25}) + + with pytest.raises(ServiceResponseException, match=("Hugging Face completion failed")): + async for _ in service.get_streaming_text_contents(prompt, exec_settings): + pass + + +def test_hugging_face_text_completion_init(): + with ( + patch("semantic_kernel.connectors.ai.hugging_face.services.hf_text_completion.pipeline") as patched_pipeline, + patch( + "semantic_kernel.connectors.ai.hugging_face.services.hf_text_completion.torch.cuda.is_available" + ) as mock_torch_cuda_is_available, + ): + patched_pipeline.return_value = patched_pipeline + mock_torch_cuda_is_available.return_value = False + + ai_model_id = "test-model" + task = "summarization" + device = -1 + + service = HuggingFaceTextCompletion(service_id="test", ai_model_id=ai_model_id, task=task, device=device) + + assert service is not None diff --git a/python/tests/unit/connectors/hugging_face/test_hf_text_embedding.py b/python/tests/unit/connectors/hugging_face/test_hf_text_embedding.py new file mode 100644 index 000000000000..ea4c4b6f7a7a --- /dev/null +++ b/python/tests/unit/connectors/hugging_face/test_hf_text_embedding.py @@ -0,0 +1,66 @@ +# Copyright (c) Microsoft. All rights reserved. 
diff --git a/python/tests/unit/connectors/hugging_face/test_hf_text_embedding.py b/python/tests/unit/connectors/hugging_face/test_hf_text_embedding.py
new file mode 100644
index 000000000000..ea4c4b6f7a7a
--- /dev/null
+++ b/python/tests/unit/connectors/hugging_face/test_hf_text_embedding.py
@@ -0,0 +1,66 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+from unittest.mock import patch
+
+import pytest
+from numpy import array, ndarray
+
+from semantic_kernel.connectors.ai.hugging_face.services.hf_text_embedding import (
+    HuggingFaceTextEmbedding,
+)
+from semantic_kernel.exceptions import ServiceResponseException
+
+
+def test_huggingface_text_embedding_initialization():
+    model_name = "sentence-transformers/all-MiniLM-L6-v2"
+    device = -1
+
+    with patch(
+        "semantic_kernel.connectors.ai.hugging_face.services.hf_text_embedding.sentence_transformers.SentenceTransformer"
+    ) as mock_transformer:
+        mock_instance = mock_transformer.return_value
+        service = HuggingFaceTextEmbedding(service_id="test", ai_model_id=model_name, device=device)
+
+        assert service.ai_model_id == model_name
+        assert service.device == "cpu"
+        assert service.generator == mock_instance
+        mock_transformer.assert_called_once_with(model_name_or_path=model_name, device="cpu")
+
+
+@pytest.mark.asyncio
+async def test_generate_embeddings_success():
+    model_name = "sentence-transformers/all-MiniLM-L6-v2"
+    device = -1
+    texts = ["Hello world!", "How are you?"]
+    mock_embeddings = array([[0.1, 0.2], [0.3, 0.4]])
+
+    with patch(
+        "semantic_kernel.connectors.ai.hugging_face.services.hf_text_embedding.sentence_transformers.SentenceTransformer"
+    ) as mock_transformer:
+        mock_instance = mock_transformer.return_value
+        mock_instance.encode.return_value = mock_embeddings
+
+        service = HuggingFaceTextEmbedding(service_id="test", ai_model_id=model_name, device=device)
+        embeddings = await service.generate_embeddings(texts)
+
+        assert isinstance(embeddings, ndarray)
+        assert embeddings.shape == (2, 2)
+        assert (embeddings == mock_embeddings).all()
+
+
+@pytest.mark.asyncio
+async def test_generate_embeddings_throws():
+    model_name = "sentence-transformers/all-MiniLM-L6-v2"
+    device = -1
+    texts = ["Hello world!", "How are you?"]
+
+    with patch(
+        "semantic_kernel.connectors.ai.hugging_face.services.hf_text_embedding.sentence_transformers.SentenceTransformer"
+    ) as mock_transformer:
+        mock_instance = mock_transformer.return_value
+        mock_instance.encode.side_effect = Exception("Test exception")
+
+        service = HuggingFaceTextEmbedding(service_id="test", ai_model_id=model_name, device=device)
+
+        with pytest.raises(ServiceResponseException, match="Hugging Face embeddings failed"):
+            await service.generate_embeddings(texts)
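> For context, a hedged usage sketch of the embedding service exercised by the new test file, assuming semantic-kernel and sentence-transformers are installed (unlike the mocked tests, this actually downloads the model from the Hugging Face hub):

```python
import asyncio

from semantic_kernel.connectors.ai.hugging_face.services.hf_text_embedding import HuggingFaceTextEmbedding


async def main() -> None:
    service = HuggingFaceTextEmbedding(
        service_id="hf-embeddings",
        ai_model_id="sentence-transformers/all-MiniLM-L6-v2",
        device=-1,  # -1 selects CPU; 0+ selects a CUDA device when available
    )
    embeddings = await service.generate_embeddings(["Hello world!", "How are you?"])
    print(embeddings.shape)  # (2, <embedding dimension of the model>)


if __name__ == "__main__":
    asyncio.run(main())
```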