From 0510cd06a008aa7d10b29c0a147dc479dac023ea Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Sat, 23 Mar 2024 07:58:09 -0400 Subject: [PATCH] validate NVIDIAEmbeddings.embed_documents input type is list[str] resolves https://github.com/langchain-ai/langchain-nvidia/issues/8 --- .../embeddings.py | 5 ++ libs/ai-endpoints/poetry.lock | 21 ++++- libs/ai-endpoints/pyproject.toml | 1 + .../tests/unit_tests/test_embeddings.py | 82 +++++++++++++++++++ 4 files changed, 108 insertions(+), 1 deletion(-) create mode 100644 libs/ai-endpoints/tests/unit_tests/test_embeddings.py diff --git a/libs/ai-endpoints/langchain_nvidia_ai_endpoints/embeddings.py b/libs/ai-endpoints/langchain_nvidia_ai_endpoints/embeddings.py index 9fa8ed41..4bb7628a 100644 --- a/libs/ai-endpoints/langchain_nvidia_ai_endpoints/embeddings.py +++ b/libs/ai-endpoints/langchain_nvidia_ai_endpoints/embeddings.py @@ -49,6 +49,11 @@ def embed_query(self, text: str) -> List[float]: def embed_documents(self, texts: List[str]) -> List[List[float]]: """Input pathway for document embeddings.""" + if not isinstance(texts, list) or not all( + isinstance(text, str) for text in texts + ): + raise ValueError(f"`texts` must be a list of strings, given: {repr(texts)}") + # From https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-foundation/models/nvolve-40k/documentation # The input must not exceed the 2048 max input characters and inputs above 512 # model tokens will be truncated. The input array must not exceed 50 input diff --git a/libs/ai-endpoints/poetry.lock b/libs/ai-endpoints/poetry.lock index 9e5d0bd3..41839978 100644 --- a/libs/ai-endpoints/poetry.lock +++ b/libs/ai-endpoints/poetry.lock @@ -1122,6 +1122,25 @@ urllib3 = ">=1.21.1,<3" socks = ["PySocks (>=1.5.6,!=1.5.7)"] use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] +[[package]] +name = "requests-mock" +version = "1.11.0" +description = "Mock out responses from the requests package" +optional = false +python-versions = "*" +files = [ + {file = "requests-mock-1.11.0.tar.gz", hash = "sha256:ef10b572b489a5f28e09b708697208c4a3b2b89ef80a9f01584340ea357ec3c4"}, + {file = "requests_mock-1.11.0-py2.py3-none-any.whl", hash = "sha256:f7fae383f228633f6bececebdab236c478ace2284d6292c6e7e2867b9ab74d15"}, +] + +[package.dependencies] +requests = ">=2.3,<3" +six = "*" + +[package.extras] +fixture = ["fixtures"] +test = ["fixtures", "mock", "purl", "pytest", "requests-futures", "sphinx", "testtools"] + [[package]] name = "ruff" version = "0.1.15" @@ -1409,4 +1428,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "28b85aa52e8aaa1cb83dd8dfba132d118628a678bd9a228227a992394fedf7de" +content-hash = "90a276c531fed21a235d02a914b9bc6ce7425e3666053c184f1a423668d1bf22" diff --git a/libs/ai-endpoints/pyproject.toml b/libs/ai-endpoints/pyproject.toml index a0d01c5a..5c1197b2 100644 --- a/libs/ai-endpoints/pyproject.toml +++ b/libs/ai-endpoints/pyproject.toml @@ -27,6 +27,7 @@ syrupy = "^4.0.2" pytest-watcher = "^0.3.4" pytest-asyncio = "^0.21.1" langchain-core = "^0.1.5" +requests-mock = "^1.11.0" [tool.poetry.group.codespell] optional = true diff --git a/libs/ai-endpoints/tests/unit_tests/test_embeddings.py b/libs/ai-endpoints/tests/unit_tests/test_embeddings.py new file mode 100644 index 00000000..0cfd96cf --- /dev/null +++ b/libs/ai-endpoints/tests/unit_tests/test_embeddings.py @@ -0,0 +1,82 @@ +from typing import Generator + +import pytest +from requests_mock import Mocker + +from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings + + +@pytest.fixture +def embedding(requests_mock: Mocker) -> Generator[NVIDIAEmbeddings, None, None]: + model = "mock-model" + requests_mock.get( + "https://api.nvcf.nvidia.com/v2/nvcf/functions", + json={ + "functions": [ + { + "id": "ID", + "ncaId": "NCA-ID", + "versionId": "VERSION-ID", + "name": model, + "status": "ACTIVE", + "ownedByDifferentAccount": True, + "apiBodyFormat": "CUSTOM", + "healthUri": "/v2/health/ready", + "createdAt": "0000-00-00T00:00:00.000Z", + } + ] + }, + ) + requests_mock.post( + "https://api.nvcf.nvidia.com/v2/nvcf/pexec/functions/ID", + json={ + "data": [ + { + "embedding": [ + 0.1, + 0.2, + 0.3, + ], + "index": 0, + } + ], + "usage": {"prompt_tokens": 8, "total_tokens": 8}, + }, + ) + yield NVIDIAEmbeddings(model=model, nvidia_api_key="a-bogus-key") + + +def test_embed_documents_negative_input_int(embedding: NVIDIAEmbeddings) -> None: + documents = 1 + with pytest.raises(ValueError): + embedding.embed_documents(documents) # type: ignore + + +def test_embed_documents_negative_input_float(embedding: NVIDIAEmbeddings) -> None: + documents = 1.0 + with pytest.raises(ValueError): + embedding.embed_documents(documents) # type: ignore + + +def test_embed_documents_negative_input_str(embedding: NVIDIAEmbeddings) -> None: + documents = "subscriptable string, not a list" + with pytest.raises(ValueError): + embedding.embed_documents(documents) # type: ignore + + +def test_embed_documents_negative_input_list_int(embedding: NVIDIAEmbeddings) -> None: + documents = [1, 2, 3] + with pytest.raises(ValueError): + embedding.embed_documents(documents) # type: ignore + + +def test_embed_documents_negative_input_list_float(embedding: NVIDIAEmbeddings) -> None: + documents = [1.0, 2.0, 3.0] + with pytest.raises(ValueError): + embedding.embed_documents(documents) # type: ignore + + +def test_embed_documents_negative_input_list_mixed(embedding: NVIDIAEmbeddings) -> None: + documents = ["1", 2.0, 3] + with pytest.raises(ValueError): + embedding.embed_documents(documents) # type: ignore