From cd7352d037dc9b045bdfe2ad7cbcad771f2c8e09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20K=C3=B6hnecke?= Date: Wed, 4 Dec 2024 12:39:48 +0100 Subject: [PATCH 01/15] wip: enable document index url to be configured via env in tests --- env.sample => .env.example | 6 +++-- README.md | 2 +- tests/conftest.py | 19 ++++++++++------ .../document_index/test_document_index.py | 22 +++++++++++-------- 4 files changed, 30 insertions(+), 19 deletions(-) rename env.sample => .env.example (90%) diff --git a/env.sample b/.env.example similarity index 90% rename from env.sample rename to .env.example index 2b5e236e1..a7f982c22 100644 --- a/env.sample +++ b/.env.example @@ -1,4 +1,3 @@ -CLIENT_URL="https://api.aleph-alpha.com" ARGILLA_API_URL="http://localhost:6900/" ARGILLA_API_KEY="argilla.apikey" @@ -14,6 +13,9 @@ POSTGRES_USER=il_sdk POSTGRES_PASSWORD=test # things to adapt +CLIENT_URL=... +AA_TOKEN=token DATA_SERVICE_URL=... +DOCUMENT_INDEX_URL=... + HUGGING_FACE_TOKEN=token -AA_TOKEN=token diff --git a/README.md b/README.md index 5872bdac6..2ae87a8d2 100644 --- a/README.md +++ b/README.md @@ -116,7 +116,7 @@ The tutorials aim to guide you through implementing several common use-cases wit ### Setup LLM access -The tutorials require access to an LLM endpoint. You can choose between using the Aleph Alpha API (`https://api.aleph-alpha.com`) or an on-premise setup by configuring the appropriate environment variables. To configure the environment variables, create a `.env` file in the root directory of the project and copy the contents of the `.env.sample` file into it. +The tutorials require access to an LLM endpoint. You can choose between using the Aleph Alpha API (`https://api.aleph-alpha.com`) or an on-premise setup by configuring the appropriate environment variables. To configure the environment variables, create a `.env` file in the root directory of the project and copy the contents of the `.env.example` file into it. 
To use the **Aleph Alpha API**, that is set as the default host URL, set the `AA_TOKEN` variable to your [Aleph Alpha access token,](https://docs.aleph-alpha.com/docs/account/#create-a-new-token) and you are good to go. diff --git a/tests/conftest.py b/tests/conftest.py index 27e77e667..558e4605d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,3 +1,4 @@ +import os from collections.abc import Sequence from os import getenv from pathlib import Path @@ -50,7 +51,9 @@ def client(token: str) -> AlephAlphaClientProtocol: token: AA Token """ return LimitedConcurrencyClient( - Client(token), max_concurrency=10, max_retry_time=2 * 60 + Client(token, host=os.environ["CLIENT_URL"]), + max_concurrency=10, + max_retry_time=10, ) @@ -61,7 +64,14 @@ def luminous_control_model(client: AlephAlphaClientProtocol) -> LuminousControlM @fixture(scope="session") def pharia_1_chat_model(client: AlephAlphaClientProtocol) -> Pharia1ChatModel: - return Pharia1ChatModel("Pharia-1-LLM-7B-control", client) + return Pharia1ChatModel("pharia-1-llm-7b-control", client) + + +@fixture(scope="session") +def document_index(token: str) -> DocumentIndexClient: + return DocumentIndexClient( + token, base_document_index_url=os.environ["DOCUMENT_INDEX_URL"] + ) @fixture @@ -101,11 +111,6 @@ def symmetric_in_memory_retriever( ) -@fixture -def document_index(token: str) -> DocumentIndexClient: - return DocumentIndexClient(token) - - @fixture def document_index_retriever( document_index: DocumentIndexClient, diff --git a/tests/connectors/document_index/test_document_index.py b/tests/connectors/document_index/test_document_index.py index a15d56b29..d74304403 100644 --- a/tests/connectors/document_index/test_document_index.py +++ b/tests/connectors/document_index/test_document_index.py @@ -125,13 +125,14 @@ def document_index_namespace() -> str: @fixture(scope="session", autouse=True) -def _teardown(token: str, document_index_namespace: str) -> Iterator[None]: +def _teardown( + document_index: 
DocumentIndexClient, document_index_namespace: str +) -> Iterator[None]: yield # Cleanup leftover resources from previous runs. timestamp_threshold = datetime.now(timezone.utc) - timedelta(hours=1) - document_index = DocumentIndexClient(token) collections = document_index.list_collections(document_index_namespace) for collection_path in collections: if is_outdated_identifier(collection_path.collection, timestamp_threshold): @@ -188,13 +189,11 @@ def random_collection_path( @fixture(scope="session") def read_only_collection_path( - token: str, + document_index: DocumentIndexClient, document_index_namespace: str, document_contents_with_metadata: list[DocumentContents], filter_index_config: dict[str, dict[str, str]], ) -> Iterator[CollectionPath]: - document_index = DocumentIndexClient(token) - name = random_identifier() collection_path = CollectionPath( namespace=document_index_namespace, collection=name @@ -359,7 +358,9 @@ def test_document_index_sets_no_authorization_header_when_token_is_none() -> Non @pytest.mark.internal -def test_document_index_lists_namespaces(document_index: DocumentIndexClient) -> None: +def test_document_index_lists_namespaces( + document_index: DocumentIndexClient, +) -> None: namespaces = document_index.list_namespaces() assert "aleph-alpha" in namespaces @@ -537,7 +538,8 @@ def test_document_path_from_string( def test_document_list_all_documents( - document_index: DocumentIndexClient, read_only_collection_path: CollectionPath + document_index: DocumentIndexClient, + read_only_collection_path: CollectionPath, ) -> None: filter_result = document_index.documents(read_only_collection_path) @@ -545,7 +547,8 @@ def test_document_list_all_documents( def test_document_list_max_n_documents( - document_index: DocumentIndexClient, read_only_collection_path: CollectionPath + document_index: DocumentIndexClient, + read_only_collection_path: CollectionPath, ) -> None: filter_query_params = DocumentFilterQueryParams(max_documents=1, starts_with=None) @@ 
-628,7 +631,8 @@ def test_instructable_indexes_in_namespace_are_returned( def test_indexes_for_collection_are_returned( - document_index: DocumentIndexClient, read_only_collection_path: CollectionPath + document_index: DocumentIndexClient, + read_only_collection_path: CollectionPath, ) -> None: index_names = document_index.list_assigned_index_names(read_only_collection_path) assert "ci-intelligence-layer" in index_names From f2dabf2e6e295c02acf6b99e238b0ce260fa7bd5 Mon Sep 17 00:00:00 2001 From: Michael Barlow <25936840+Michael-JB@users.noreply.github.com> Date: Thu, 5 Dec 2024 14:33:41 +0100 Subject: [PATCH 02/15] fix: migrate Document Index tests to p-prod - migrate tests to use p-prod instance - tests no longer assume existing state - tests always create and clean up resources per run - filter indexes are now also cleaned up - minor typo fixes --- .../document_index/document_index.py | 4 +- .../document_index/test_document_index.py | 604 +++++++++++------- 2 files changed, 358 insertions(+), 250 deletions(-) diff --git a/src/intelligence_layer/connectors/document_index/document_index.py b/src/intelligence_layer/connectors/document_index/document_index.py index ad0bfd950..5b83a5acc 100644 --- a/src/intelligence_layer/connectors/document_index/document_index.py +++ b/src/intelligence_layer/connectors/document_index/document_index.py @@ -24,8 +24,8 @@ class IndexPath(BaseModel, frozen=True): """Path to an index. Args: - namespace: Holds collections. - index: The name of the index, holds a config. + namespace: The namespace to which this index belongs. + index: The name of the index. 
""" namespace: str diff --git a/tests/connectors/document_index/test_document_index.py b/tests/connectors/document_index/test_document_index.py index d74304403..f444d88ba 100644 --- a/tests/connectors/document_index/test_document_index.py +++ b/tests/connectors/document_index/test_document_index.py @@ -41,20 +41,20 @@ @overload def retry( - func: None = None, max_retries: int = 3, secondy_delay: float = 0.0 + func: None = None, max_retries: int = 3, seconds_delay: float = 0.0 ) -> Callable[[Callable[P, R]], Callable[P, R]]: ... @overload def retry( - func: Callable[P, R], max_retries: int = 3, secondy_delay: float = 0.0 + func: Callable[P, R], max_retries: int = 3, seconds_delay: float = 0.0 ) -> Callable[P, R]: ... def retry( func: Callable[P, R] | None = None, max_retries: int = 25, - secondy_delay: float = 0.2, + seconds_delay: float = 0.2, ) -> Callable[[Callable[P, R]], Callable[P, R]] | Callable[P, R]: def decorator(func: Callable[P, R]) -> Callable[P, R]: @wraps(func) @@ -64,7 +64,7 @@ def wrapper(*args: P.args, **kwargs: P.kwargs) -> R: return func(*args, **kwargs) except Exception as e: last_exception = e - sleep(secondy_delay) + sleep(seconds_delay) raise last_exception @@ -81,7 +81,7 @@ def random_alphanumeric_string(length: int = 20) -> str: def random_identifier() -> str: - name = random_alphanumeric_string(20) + name = random_alphanumeric_string(10) timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S") return f"ci-il-{name}-{timestamp}" @@ -119,9 +119,72 @@ def random_embedding_config() -> EmbeddingConfig: return random.choice([random_semantic_embed(), random_instructable_embed()]) +@fixture +def document_contents() -> DocumentContents: + text = """John Stith Pemberton, the inventor of the world-renowned beverage Coca-Cola, was a figure whose life was marked by creativity, entrepreneurial spirit, and the turbulent backdrop of 19th-century America. 
Born on January 8, 1831, in Knoxville, Georgia, Pemberton grew up in an era of profound transformation and change. + +Pemberton began his professional journey by studying medicine and pharmacy. After earning a degree in pharmacy, he started his career as a druggist in Columbus, Georgia. He was known for his keen interest in creating medicinal concoctions and was well-respected in his community. His early creations included various medicines and tonics, which were typical of the times when pharmacists often concocted their own remedies. + +Pemberton's life took a significant turn during the American Civil War. He served as a lieutenant colonel in the Confederate Army, and it was during this period that he sustained a wound that led him to become dependent on morphine. This personal struggle with addiction likely influenced his later work in seeking out alternatives and remedies for pain relief. + +In the post-war years, Pemberton relocated to Atlanta, Georgia, where he continued to experiment with various medicinal syrups and tonics. It was during this time, in the late 19th century, that he developed a beverage he initially called "Pemberton's French Wine Coca." This concoction was inspired by Vin Mariani, a popular French tonic wine that contained coca leaves. Pemberton's beverage was intended to serve not just as a refreshing drink but also as a remedy for various ailments, including morphine addiction, indigestion, and headaches. + +However, in 1886, when Atlanta introduced prohibition legislation, Pemberton was compelled to create a non-alcoholic version of his beverage. He experimented with a combination of carbonated water, coca leaf extract, kola nut, and other ingredients, eventually perfecting the formula for what would soon become Coca-Cola. The name was suggested by his bookkeeper, Frank Robinson, who also created the distinctive cursive logo that is still in use today. 
+ +Pemberton advertised his new creation as a "brain tonic" and "temperance drink," asserting that it could alleviate headaches and fatigue. However, due to his declining health and financial difficulties, Pemberton was eventually compelled to sell portions of his business to various partners. Shortly before his death in 1888, he sold his remaining stake in Coca-Cola to Asa G. Candler, a fellow pharmacist and businessman. + +Under Candler's leadership, Coca-Cola transformed from a pharmacist's concoction into a mass-produced and marketed beverage that became a staple of American culture and a global icon. Despite the changes and the immense growth of the brand, the legacy of John Stith Pemberton as the inventor of Coca-Cola remains an integral part of the beverage's history. + +Pemberton's life story is a testament to the spirit of innovation and resilience. His creation, borne out of personal struggles and the context of his times, went on to transcend its origins and become a symbol recognized across the globe. Today, when we think of Coca-Cola, we are reminded of Pemberton's journey from a small-town pharmacist to the creator of one of the world's most enduring and beloved brands.""" + return DocumentContents(contents=[text], metadata={"Some": "Metadata"}) + + +@fixture(scope="session") +def document_contents_with_metadata() -> list[DocumentContents]: + text_1 = """John Stith Pemberton, the inventor of the world-renowned beverage Coca-Cola, was a figure whose life was marked by creativity, entrepreneurial spirit, and the turbulent backdrop of 19th-century America. Born on January 8, 1831, in Knoxville, Georgia, Pemberton grew up in an era of profound transformation and change.""" + text_2 = """Pemberton began his professional journey by studying medicine and pharmacy. After earning a degree in pharmacy, he started his career as a druggist in Columbus, Georgia. 
He was known for his keen interest in creating medicinal concoctions and was well-respected in his community. His early creations included various medicines and tonics, which were typical of the times when pharmacists often concocted their own remedies.""" + text_3 = """Pemberton's life took a significant turn during the American Civil War. He served as a lieutenant colonel in the Confederate Army, and it was during this period that he sustained a wound that led him to become dependent on morphine. This personal struggle with addiction likely influenced his later work in seeking out alternatives and remedies for pain relief.""" + + metadata_1: JsonSerializable = { + "string-field": "example_string_1", + "integer-field": 123, + "float-field": 123.45, + "boolean-field": True, + "date-field": datetime(2022, 1, 1, tzinfo=timezone.utc) + .isoformat(timespec="seconds") + .replace("+00:00", "Z"), + } + + metadata_2: JsonSerializable = { + "string-field": "example_string_2", + "integer-field": 456, + "float-field": 678.90, + "boolean-field": False, + "date-field": datetime(2023, 1, 1, tzinfo=timezone.utc) + .isoformat(timespec="seconds") + .replace("+00:00", "Z"), + } + + metadata_3: JsonSerializable = { + "string-field": "example_string_3", + "integer-field": 789, + "float-field": 101112.13, + "boolean-field": True, + "date-field": datetime(2024, 1, 1, tzinfo=timezone.utc) + .isoformat(timespec="seconds") + .replace("+00:00", "Z"), + } + + return [ + DocumentContents(contents=[text_1], metadata=metadata_1), + DocumentContents(contents=[text_2], metadata=metadata_2), + DocumentContents(contents=[text_3], metadata=metadata_3), + ] + + @fixture(scope="session") def document_index_namespace() -> str: - return "team-document-index" + return "Search" @fixture(scope="session", autouse=True) @@ -143,92 +206,42 @@ def _teardown( if is_outdated_identifier(index_path.index, timestamp_threshold): document_index.delete_index(index_path) + filter_indexes = 
document_index.list_filter_indexes_in_namespace( + document_index_namespace + ) + for filter_index in filter_indexes: + if is_outdated_identifier(filter_index, timestamp_threshold): + document_index.delete_filter_index_from_namespace( + document_index_namespace, filter_index + ) + @fixture(scope="session") -def filter_index_config() -> dict[str, dict[str, str]]: +def filter_index_configs() -> dict[str, dict[str, str]]: return { - "test-string-filter": { + random_identifier(): { "field-name": "string-field", "field-type": "string", }, - "test-integer-filter": { + random_identifier(): { "field-name": "integer-field", "field-type": "integer", }, - "test-float-filter": { + random_identifier(): { "field-name": "float-field", "field-type": "float", }, - "test-boolean-filter": { + random_identifier(): { "field-name": "boolean-field", "field-type": "boolean", }, - "test-date-filter": { + random_identifier(): { "field-name": "date-field", "field-type": "date_time", }, } -@fixture -def random_collection_path( - document_index: DocumentIndexClient, - document_index_namespace: str, -) -> Iterator[CollectionPath]: - name = random_identifier() - collection_path = CollectionPath( - namespace=document_index_namespace, collection=name - ) - try: - document_index.create_collection(collection_path) - - yield collection_path - finally: - document_index.delete_collection(collection_path) - - -@fixture(scope="session") -def read_only_collection_path( - document_index: DocumentIndexClient, - document_index_namespace: str, - document_contents_with_metadata: list[DocumentContents], - filter_index_config: dict[str, dict[str, str]], -) -> Iterator[CollectionPath]: - name = random_identifier() - collection_path = CollectionPath( - namespace=document_index_namespace, collection=name - ) - try: - document_index.create_collection(collection_path) - - # Add 3 documents - for i, content in enumerate(document_contents_with_metadata): - document_index.add_document( - DocumentPath( - 
collection_path=collection_path, - document_name=f"document-metadata-{i}", - ), - content, - ) - - # Assign index - document_index.assign_index_to_collection( - collection_path, "ci-intelligence-layer" - ) - - # Assign filter indexes - for filter_index in filter_index_config: - document_index.assign_filter_index_to_search_index( - collection_path=collection_path, - index_name="ci-intelligence-layer", - filter_index_name=filter_index, - ) - - yield collection_path - finally: - document_index.delete_collection(collection_path) - - @contextmanager def random_index_with_embedding_config( document_index: DocumentIndexClient, @@ -279,66 +292,171 @@ def random_semantic_index( @fixture -def document_contents() -> DocumentContents: - text = """John Stith Pemberton, the inventor of the world-renowned beverage Coca-Cola, was a figure whose life was marked by creativity, entrepreneurial spirit, and the turbulent backdrop of 19th-century America. Born on January 8, 1831, in Knoxville, Georgia, Pemberton grew up in an era of profound transformation and change. +def random_index( + document_index: DocumentIndexClient, document_index_namespace: str +) -> Iterator[tuple[IndexPath, IndexConfiguration]]: + with random_index_with_embedding_config( + document_index, + document_index_namespace, + random.choice([random_semantic_embed(), random_instructable_embed()]), + ) as index: + yield index -Pemberton began his professional journey by studying medicine and pharmacy. After earning a degree in pharmacy, he started his career as a druggist in Columbus, Georgia. He was known for his keen interest in creating medicinal concoctions and was well-respected in his community. His early creations included various medicines and tonics, which were typical of the times when pharmacists often concocted their own remedies. -Pemberton's life took a significant turn during the American Civil War. 
He served as a lieutenant colonel in the Confederate Army, and it was during this period that he sustained a wound that led him to become dependent on morphine. This personal struggle with addiction likely influenced his later work in seeking out alternatives and remedies for pain relief. +@fixture +def random_filter_index( + document_index: DocumentIndexClient, document_index_namespace: str +) -> Iterator[str]: + name = random_identifier() + field_name = random_identifier() + field_type = random.choice(["string", "integer", "float", "boolean", "date_time"]) -In the post-war years, Pemberton relocated to Atlanta, Georgia, where he continued to experiment with various medicinal syrups and tonics. It was during this time, in the late 19th century, that he developed a beverage he initially called "Pemberton's French Wine Coca." This concoction was inspired by Vin Mariani, a popular French tonic wine that contained coca leaves. Pemberton's beverage was intended to serve not just as a refreshing drink but also as a remedy for various ailments, including morphine addiction, indigestion, and headaches. + try: + document_index.create_filter_index_in_namespace( + namespace=document_index_namespace, + filter_index_name=name, + field_name=field_name, + field_type=field_type, # type:ignore[arg-type], + ) -However, in 1886, when Atlanta introduced prohibition legislation, Pemberton was compelled to create a non-alcoholic version of his beverage. He experimented with a combination of carbonated water, coca leaf extract, kola nut, and other ingredients, eventually perfecting the formula for what would soon become Coca-Cola. The name was suggested by his bookkeeper, Frank Robinson, who also created the distinctive cursive logo that is still in use today. 
+ yield name + finally: + document_index.delete_filter_index_from_namespace( + document_index_namespace, name + ) -Pemberton advertised his new creation as a "brain tonic" and "temperance drink," asserting that it could alleviate headaches and fatigue. However, due to his declining health and financial difficulties, Pemberton was eventually compelled to sell portions of his business to various partners. Shortly before his death in 1888, he sold his remaining stake in Coca-Cola to Asa G. Candler, a fellow pharmacist and businessman. -Under Candler's leadership, Coca-Cola transformed from a pharmacist's concoction into a mass-produced and marketed beverage that became a staple of American culture and a global icon. Despite the changes and the immense growth of the brand, the legacy of John Stith Pemberton as the inventor of Coca-Cola remains an integral part of the beverage's history. +@fixture +def random_collection( + document_index: DocumentIndexClient, + document_index_namespace: str, +) -> Iterator[CollectionPath]: + collection_name = random_identifier() + collection_path = CollectionPath( + namespace=document_index_namespace, collection=collection_name + ) + try: + document_index.create_collection(collection_path) -Pemberton's life story is a testament to the spirit of innovation and resilience. His creation, borne out of personal struggles and the context of his times, went on to transcend its origins and become a symbol recognized across the globe. 
Today, when we think of Coca-Cola, we are reminded of Pemberton's journey from a small-town pharmacist to the creator of one of the world's most enduring and beloved brands.""" - return DocumentContents(contents=[text], metadata={"Some": "Metadata"}) + yield collection_path + finally: + document_index.delete_collection(collection_path) @fixture(scope="session") -def document_contents_with_metadata() -> list[DocumentContents]: - text_1 = """John Stith Pemberton, the inventor of the world-renowned beverage Coca-Cola, was a figure whose life was marked by creativity, entrepreneurial spirit, and the turbulent backdrop of 19th-century America. Born on January 8, 1831, in Knoxville, Georgia, Pemberton grew up in an era of profound transformation and change.""" - text_2 = """Pemberton began his professional journey by studying medicine and pharmacy. After earning a degree in pharmacy, he started his career as a druggist in Columbus, Georgia. He was known for his keen interest in creating medicinal concoctions and was well-respected in his community. His early creations included various medicines and tonics, which were typical of the times when pharmacists often concocted their own remedies.""" - text_3 = """Pemberton's life took a significant turn during the American Civil War. He served as a lieutenant colonel in the Confederate Army, and it was during this period that he sustained a wound that led him to become dependent on morphine. 
This personal struggle with addiction likely influenced his later work in seeking out alternatives and remedies for pain relief.""" +def read_only_populated_collection( + document_index: DocumentIndexClient, + document_index_namespace: str, + document_contents_with_metadata: list[DocumentContents], + filter_index_configs: dict[str, dict[str, str]], +) -> Iterator[tuple[CollectionPath, IndexPath]]: + index_name = random_identifier() + index_path = IndexPath(namespace=document_index_namespace, index=index_name) + index_configuration = IndexConfiguration( + chunk_size=512, + chunk_overlap=0, + hybrid_index="bm25", + embedding=SemanticEmbed( + representation="asymmetric", + model_name="luminous-base", + ), + ) - metadata_1: JsonSerializable = { - "string-field": "example_string_1", - "integer-field": 123, - "float-field": 123.45, - "boolean-field": True, - "date-field": datetime(2022, 1, 1, tzinfo=timezone.utc) - .isoformat(timespec="seconds") - .replace("+00:00", "Z"), - } + collection_name = random_identifier() + collection_path = CollectionPath( + namespace=document_index_namespace, collection=collection_name + ) - metadata_2: JsonSerializable = { - "string-field": "example_string_2", - "integer-field": 456, - "float-field": 678.90, - "boolean-field": False, - "date-field": datetime(2023, 1, 1, tzinfo=timezone.utc) - .isoformat(timespec="seconds") - .replace("+00:00", "Z"), - } + try: + document_index.create_collection(collection_path) + document_index.create_index(index_path, index_configuration) + document_index.assign_index_to_collection(collection_path, index_name) + + for name, config in filter_index_configs.items(): + document_index.create_filter_index_in_namespace( + namespace=document_index_namespace, + filter_index_name=name, + field_name=config["field-name"], + field_type=config["field-type"], # type:ignore[arg-type] + ) + document_index.assign_filter_index_to_search_index( + collection_path=collection_path, + index_name=index_name, + 
filter_index_name=name, + ) - metadata_3: JsonSerializable = { - "string-field": "example_string_3", - "integer-field": 789, - "float-field": 101112.13, - "boolean-field": True, - "date-field": datetime(2024, 1, 1, tzinfo=timezone.utc) - .isoformat(timespec="seconds") - .replace("+00:00", "Z"), - } + for i, content in enumerate(document_contents_with_metadata): + document_index.add_document( + DocumentPath( + collection_path=collection_path, + document_name=f"document-{i}", + ), + content, + ) - return [ - DocumentContents(contents=[text_1], metadata=metadata_1), - DocumentContents(contents=[text_2], metadata=metadata_2), - DocumentContents(contents=[text_3], metadata=metadata_3), - ] + yield collection_path, index_path + finally: + document_index.delete_collection(collection_path) + + @retry + def clean_up_indexes() -> None: + document_index.delete_index(index_path) + for filter_index_name in filter_index_configs: + document_index.delete_filter_index_from_namespace( + document_index_namespace, filter_index_name + ) + + clean_up_indexes() + + +@fixture +def random_searchable_collection( + document_index: DocumentIndexClient, + document_contents_with_metadata: list[DocumentContents], + random_index: tuple[IndexPath, IndexConfiguration], + random_collection: CollectionPath, +) -> Iterator[tuple[CollectionPath, IndexPath]]: + index_path, _ = random_index + index_name = index_path.index + collection_path = random_collection + + try: + # Assign index + document_index.assign_index_to_collection(collection_path, index_name) + + # Add 3 documents + for i, content in enumerate(document_contents_with_metadata): + document_index.add_document( + DocumentPath( + collection_path=collection_path, + document_name=f"document-{i}", + ), + content, + ) + + # Ensure documents are searchable; this allows time for indexing + @retry + def search() -> None: + search_result = document_index.search( + collection_path, + index_name, + SearchQuery( + query="Coca-Cola", + ), + ) + assert 
len(search_result) > 0 + + search() + + yield collection_path, index_path + finally: + document_index.delete_collection(collection_path) + + @retry + def clean_up_index() -> None: + document_index.delete_index(index_path) + + clean_up_index() @pytest.mark.internal @@ -360,19 +478,20 @@ def test_document_index_sets_no_authorization_header_when_token_is_none() -> Non @pytest.mark.internal def test_document_index_lists_namespaces( document_index: DocumentIndexClient, + document_index_namespace: str, ) -> None: namespaces = document_index.list_namespaces() - assert "aleph-alpha" in namespaces + assert document_index_namespace in namespaces @pytest.mark.internal def test_document_index_gets_collection( - document_index: DocumentIndexClient, random_collection_path: CollectionPath + document_index: DocumentIndexClient, random_collection: CollectionPath ) -> None: - collections = document_index.list_collections(random_collection_path.namespace) + collections = document_index.list_collections(random_collection.namespace) - assert random_collection_path in collections + assert random_collection in collections @pytest.mark.internal @@ -385,86 +504,44 @@ def test_document_index_gets_collection( ) def test_document_index_adds_document( document_index: DocumentIndexClient, - random_collection_path: CollectionPath, + random_collection: CollectionPath, document_contents: DocumentContents, document_name: str, ) -> None: document_path = DocumentPath( - collection_path=random_collection_path, + collection_path=random_collection, document_name=document_name, ) document_index.add_document(document_path, document_contents) assert any( d.document_path == document_path - for d in document_index.documents(random_collection_path) + for d in document_index.documents(random_collection) ) assert document_contents == document_index.document(document_path) @pytest.mark.internal -def test_document_index_searches_asymmetrically( - document_index: DocumentIndexClient, random_collection_path: 
CollectionPath -) -> None: - document_path = DocumentPath( - collection_path=random_collection_path, - document_name="test_document_index_searches_asymmetrically", - ) - document_contents = DocumentContents.from_text("Mark likes pizza.") - document_index.add_document(document_path, document_contents) - - document_index.assign_index_to_collection( - collection_path=random_collection_path, index_name="ci-intelligence-layer" - ) - - search_query = SearchQuery(query="Who likes pizza?", max_results=1, min_score=0.0) - - @retry - def search() -> None: - search_result = document_index.search( - document_path.collection_path, "ci-intelligence-layer", search_query - ) - - assert "Mark" in search_result[0].section - - search() - - -def test_document_index_hybrid_search_combines_semantic_and_keyword_search( - document_index: DocumentIndexClient, random_collection_path: CollectionPath +def test_document_index_searches( + document_index: DocumentIndexClient, + read_only_populated_collection: tuple[CollectionPath, IndexPath], ) -> None: - document_index.assign_index_to_collection( - random_collection_path, "ci-intelligence-layer-hybrid" - ) - - document_path = DocumentPath( - collection_path=random_collection_path, - document_name="test_document_index_hybrid_search_combines_semantic_and_keyword_search", - ) - document_contents = DocumentContents( - contents=[ - "Infant and baby are synonyms. 
Baby is also an informal term for a lover or spouse.", - "The infant was crying because it was hungry.", - "People cry when they are sad or hurt.", - ], - ) - document_index.add_document(document_path, document_contents) - + collection, index = read_only_populated_collection search_query = SearchQuery( - query="Why is the baby crying?", - max_results=3, + query="Pemberton began his professional journey by studying medicine and pharmacy.", + max_results=1, min_score=0.0, ) @retry def search() -> None: - search_results = document_index.search( - document_path.collection_path, "ci-intelligence-layer-hybrid", search_query + search_result = document_index.search( + collection, + index.index, + search_query, ) - assert "The infant was crying because" in search_results[0].section - assert "Infant and baby are synonyms" in search_results[1].section - assert "People cry" in search_results[2].section + assert search_query.query in search_result[0].section search() @@ -479,13 +556,13 @@ def search() -> None: ) def test_document_index_deletes_document( document_index: DocumentIndexClient, - random_collection_path: CollectionPath, + random_collection: CollectionPath, + document_contents: DocumentContents, document_name: str, ) -> None: document_path = DocumentPath( - collection_path=random_collection_path, document_name=document_name + collection_path=random_collection, document_name=document_name ) - document_contents = DocumentContents.from_text("Some text...") document_index.add_document(document_path, document_contents) document_index.delete_document(document_path) @@ -495,10 +572,12 @@ def test_document_index_deletes_document( def test_document_index_raises_on_getting_non_existing_document( - document_index: DocumentIndexClient, + document_index: DocumentIndexClient, document_index_namespace: str ) -> None: non_existing_document = DocumentPath( - collection_path=CollectionPath(namespace="does", collection="not"), + collection_path=CollectionPath( + 
namespace=document_index_namespace, collection="not" + ), document_name="exist", ) with raises(ResourceNotFound) as exception_info: @@ -539,38 +618,38 @@ def test_document_path_from_string( def test_document_list_all_documents( document_index: DocumentIndexClient, - read_only_collection_path: CollectionPath, + read_only_populated_collection: tuple[CollectionPath, IndexPath], ) -> None: - filter_result = document_index.documents(read_only_collection_path) + filter_result = document_index.documents(read_only_populated_collection[0]) assert len(filter_result) == 3 def test_document_list_max_n_documents( document_index: DocumentIndexClient, - read_only_collection_path: CollectionPath, + read_only_populated_collection: tuple[CollectionPath, IndexPath], ) -> None: filter_query_params = DocumentFilterQueryParams(max_documents=1, starts_with=None) filter_result = document_index.documents( - read_only_collection_path, filter_query_params + read_only_populated_collection[0], filter_query_params ) assert len(filter_result) == 1 def test_document_list_documents_with_matching_prefix( - document_index: DocumentIndexClient, random_collection_path: CollectionPath + document_index: DocumentIndexClient, random_collection: CollectionPath ) -> None: document_index.add_document( document_path=DocumentPath( - collection_path=random_collection_path, document_name="Example document" + collection_path=random_collection, document_name="Example document" ), contents=DocumentContents.from_text("Document with matching prefix"), ) document_index.add_document( document_path=DocumentPath( - collection_path=random_collection_path, document_name="Another document" + collection_path=random_collection, document_name="Another document" ), contents=DocumentContents.from_text("Document without matching prefix"), ) @@ -579,9 +658,7 @@ def test_document_list_documents_with_matching_prefix( max_documents=None, starts_with=prefix ) - filter_result = document_index.documents( - random_collection_path, 
filter_query_params - ) + filter_result = document_index.documents(random_collection, filter_query_params) assert len(filter_result) == 1 assert filter_result[0].document_path.document_name.startswith(prefix) @@ -632,18 +709,20 @@ def test_instructable_indexes_in_namespace_are_returned( def test_indexes_for_collection_are_returned( document_index: DocumentIndexClient, - read_only_collection_path: CollectionPath, + read_only_populated_collection: tuple[CollectionPath, IndexPath], ) -> None: - index_names = document_index.list_assigned_index_names(read_only_collection_path) - assert "ci-intelligence-layer" in index_names + index_names = document_index.list_assigned_index_names( + read_only_populated_collection[0] + ) + assert read_only_populated_collection[1].index in index_names def test_create_filter_indexes_in_namespace( document_index: DocumentIndexClient, document_index_namespace: str, - filter_index_config: dict[str, dict[str, str]], + filter_index_configs: dict[str, dict[str, str]], ) -> None: - for index_name, index_config in filter_index_config.items(): + for index_name, index_config in filter_index_configs.items(): document_index.create_filter_index_in_namespace( namespace=document_index_namespace, filter_index_name=index_name, @@ -654,7 +733,7 @@ def test_create_filter_indexes_in_namespace( assert all( filter_index in document_index.list_filter_indexes_in_namespace(document_index_namespace) - for filter_index in filter_index_config + for filter_index in filter_index_configs ) @@ -685,51 +764,50 @@ def test_create_filter_index_name_too_long( def test_assign_filter_indexes_to_collection( document_index: DocumentIndexClient, - random_collection_path: CollectionPath, - filter_index_config: dict[str, dict[str, str]], + random_searchable_collection: tuple[CollectionPath, IndexPath], + filter_index_configs: dict[str, dict[str, str]], ) -> None: - document_index.assign_index_to_collection( - collection_path=random_collection_path, 
index_name="ci-intelligence-layer" - ) + collection_path, index_path = random_searchable_collection + index_name = index_path.index - for index_name in filter_index_config: + for filter_index_name in filter_index_configs: document_index.assign_filter_index_to_search_index( - collection_path=random_collection_path, - filter_index_name=index_name, - index_name="ci-intelligence-layer", + collection_path=collection_path, + index_name=index_name, + filter_index_name=filter_index_name, ) + assigned_indexes = document_index.list_assigned_filter_index_names( + collection_path, index_name + ) + assert all( - filter_index - in document_index.list_assigned_filter_index_names( - random_collection_path, "ci-intelligence-layer" - ) - for filter_index in filter_index_config + filter_index in assigned_indexes for filter_index in filter_index_configs ) def test_document_index_adds_documents_with_metadata( document_index: DocumentIndexClient, - random_collection_path: CollectionPath, + random_collection: CollectionPath, document_contents_with_metadata: list[DocumentContents], ) -> None: for i, doc_content in enumerate(document_contents_with_metadata): document_path = DocumentPath( - collection_path=random_collection_path, + collection_path=random_collection, document_name=f"document-metadata-{i}", ) document_index.add_document(document_path, doc_content) assert any( d.document_path == document_path - for d in document_index.documents(random_collection_path) + for d in document_index.documents(random_collection) ) assert doc_content == document_index.document(document_path) def test_search_with_string_filter( document_index: DocumentIndexClient, - read_only_collection_path: CollectionPath, + read_only_populated_collection: tuple[CollectionPath, IndexPath], ) -> None: search_query = SearchQuery( query="Coca-Cola", @@ -751,17 +829,20 @@ def test_search_with_string_filter( @retry def search() -> None: + collection_path, index_path = read_only_populated_collection results = 
document_index.search( - read_only_collection_path, "ci-intelligence-layer", search_query + collection_path, + index_path.index, + search_query, ) - assert results[0].document_path.document_name == "document-metadata-0" + assert results[0].document_path.document_name == "document-0" search() def test_search_with_integer_filter( document_index: DocumentIndexClient, - read_only_collection_path: CollectionPath, + read_only_populated_collection: tuple[CollectionPath, IndexPath], ) -> None: search_query = SearchQuery( query="Coca-Cola", @@ -783,18 +864,21 @@ def test_search_with_integer_filter( @retry def search() -> None: + collection_path, index_path = read_only_populated_collection results = document_index.search( - read_only_collection_path, "ci-intelligence-layer", search_query + collection_path, + index_path.index, + search_query, ) assert len(results) == 1 - assert results[0].document_path.document_name == "document-metadata-0" + assert results[0].document_path.document_name == "document-0" search() def test_search_with_float_filter( document_index: DocumentIndexClient, - read_only_collection_path: CollectionPath, + read_only_populated_collection: tuple[CollectionPath, IndexPath], ) -> None: search_query = SearchQuery( query="Coca-Cola", @@ -816,19 +900,22 @@ def test_search_with_float_filter( @retry def search() -> None: + collection_path, index_path = read_only_populated_collection results = document_index.search( - read_only_collection_path, "ci-intelligence-layer", search_query + collection_path, + index_path.index, + search_query, ) assert len(results) == 2 - assert results[0].document_path.document_name == "document-metadata-1" - assert results[1].document_path.document_name == "document-metadata-2" + assert results[0].document_path.document_name == "document-1" + assert results[1].document_path.document_name == "document-2" search() def test_search_with_boolean_filter( document_index: DocumentIndexClient, - read_only_collection_path: CollectionPath, + 
read_only_populated_collection: tuple[CollectionPath, IndexPath], ) -> None: search_query = SearchQuery( query="Coca-Cola", @@ -850,18 +937,21 @@ def test_search_with_boolean_filter( @retry def search() -> None: + collection_path, index_path = read_only_populated_collection results = document_index.search( - read_only_collection_path, "ci-intelligence-layer", search_query + collection_path, + index_path.index, + search_query, ) assert len(results) == 1 - assert results[0].document_path.document_name == "document-metadata-0" + assert results[0].document_path.document_name == "document-0" search() def test_search_with_datetime_filter( document_index: DocumentIndexClient, - read_only_collection_path: CollectionPath, + read_only_populated_collection: tuple[CollectionPath, IndexPath], ) -> None: search_query = SearchQuery( query="Coca-Cola", @@ -883,18 +973,21 @@ def test_search_with_datetime_filter( @retry def search() -> None: + collection_path, index_path = read_only_populated_collection results = document_index.search( - read_only_collection_path, "ci-intelligence-layer", search_query + collection_path, + index_path.index, + search_query, ) assert len(results) == 1 - assert results[0].document_path.document_name == "document-metadata-0" + assert results[0].document_path.document_name == "document-0" search() def test_search_with_invalid_datetime_filter( document_index: DocumentIndexClient, - read_only_collection_path: CollectionPath, + read_only_populated_collection: tuple[CollectionPath, IndexPath], ) -> None: search_query = SearchQuery( query="Coca-Cola", @@ -914,14 +1007,17 @@ def test_search_with_invalid_datetime_filter( ], ) with raises(InvalidInput): + collection_path, index_path = read_only_populated_collection document_index.search( - read_only_collection_path, "ci-intelligence-layer", search_query + collection_path, + index_path.index, + search_query, ) def test_search_with_multiple_filters( document_index: DocumentIndexClient, - read_only_collection_path: 
CollectionPath, + read_only_populated_collection: tuple[CollectionPath, IndexPath], ) -> None: search_query = SearchQuery( query="Coca-Cola", @@ -948,18 +1044,21 @@ def test_search_with_multiple_filters( @retry def search() -> None: + collection_path, index_path = read_only_populated_collection results = document_index.search( - read_only_collection_path, "ci-intelligence-layer", search_query + collection_path, + index_path.index, + search_query, ) assert len(results) == 1 - assert results[0].document_path.document_name == "document-metadata-0" + assert results[0].document_path.document_name == "document-0" search() def test_search_with_filter_type_without( document_index: DocumentIndexClient, - read_only_collection_path: CollectionPath, + read_only_populated_collection: tuple[CollectionPath, IndexPath], ) -> None: search_query = SearchQuery( query="Coca-Cola", @@ -981,8 +1080,11 @@ def test_search_with_filter_type_without( @retry def search() -> None: + collection_path, index_path = read_only_populated_collection results = document_index.search( - read_only_collection_path, "ci-intelligence-layer", search_query + collection_path, + index_path.index, + search_query, ) assert len(results) == 2 @@ -991,7 +1093,7 @@ def search() -> None: def test_search_with_filter_type_without_and_with( document_index: DocumentIndexClient, - read_only_collection_path: CollectionPath, + read_only_populated_collection: tuple[CollectionPath, IndexPath], ) -> None: search_query = SearchQuery( query="Coca-Cola", @@ -1023,19 +1125,22 @@ def test_search_with_filter_type_without_and_with( @retry def search() -> None: + collection_path, index_path = read_only_populated_collection results = document_index.search( - read_only_collection_path, "ci-intelligence-layer", search_query + collection_path, + index_path.index, + search_query, ) assert len(results) == 2 - assert results[0].document_path.document_name == "document-metadata-0" - assert results[1].document_path.document_name == 
"document-metadata-2" + assert results[0].document_path.document_name == "document-0" + assert results[1].document_path.document_name == "document-2" search() def test_search_with_filter_type_with_one_of( document_index: DocumentIndexClient, - read_only_collection_path: CollectionPath, + read_only_populated_collection: tuple[CollectionPath, IndexPath], ) -> None: search_query = SearchQuery( query="Coca-Cola", @@ -1067,17 +1172,20 @@ def test_search_with_filter_type_with_one_of( @retry def search() -> None: + collection_path, index_path = read_only_populated_collection results = document_index.search( - read_only_collection_path, "ci-intelligence-layer", search_query + collection_path, + index_path.index, + search_query, ) assert len(results) == 2 - assert results[0].document_path.document_name == "document-metadata-1" - assert results[1].document_path.document_name == "document-metadata-2" + assert results[0].document_path.document_name == "document-1" + assert results[1].document_path.document_name == "document-2" search() def test_document_indexes_works( - document_index: DocumentIndexClient, random_collection_path: CollectionPath + document_index: DocumentIndexClient, random_collection: CollectionPath ) -> None: - document_index.progress(random_collection_path) + document_index.progress(random_collection) From 7d6ae7c80e273fec494b3fff6441887e1e53caee Mon Sep 17 00:00:00 2001 From: Michael Barlow <25936840+Michael-JB@users.noreply.github.com> Date: Thu, 5 Dec 2024 15:40:06 +0100 Subject: [PATCH 03/15] test: Increase retry time in DI tests --- tests/connectors/document_index/test_document_index.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/connectors/document_index/test_document_index.py b/tests/connectors/document_index/test_document_index.py index f444d88ba..3d4759ce8 100644 --- a/tests/connectors/document_index/test_document_index.py +++ b/tests/connectors/document_index/test_document_index.py @@ -53,8 +53,8 @@ def retry( def 
retry( func: Callable[P, R] | None = None, - max_retries: int = 25, - seconds_delay: float = 0.2, + max_retries: int = 60, + seconds_delay: float = 0.5, ) -> Callable[[Callable[P, R]], Callable[P, R]] | Callable[P, R]: def decorator(func: Callable[P, R]) -> Callable[P, R]: @wraps(func) From fe65883ca4a177791cb64985a0df0dc2583ad625 Mon Sep 17 00:00:00 2001 From: Michael Barlow <25936840+Michael-JB@users.noreply.github.com> Date: Thu, 5 Dec 2024 15:44:09 +0100 Subject: [PATCH 04/15] test: remove unused fixture --- .../document_index/test_document_index.py | 23 ------------------- 1 file changed, 23 deletions(-) diff --git a/tests/connectors/document_index/test_document_index.py b/tests/connectors/document_index/test_document_index.py index 3d4759ce8..5434d8633 100644 --- a/tests/connectors/document_index/test_document_index.py +++ b/tests/connectors/document_index/test_document_index.py @@ -303,29 +303,6 @@ def random_index( yield index -@fixture -def random_filter_index( - document_index: DocumentIndexClient, document_index_namespace: str -) -> Iterator[str]: - name = random_identifier() - field_name = random_identifier() - field_type = random.choice(["string", "integer", "float", "boolean", "date_time"]) - - try: - document_index.create_filter_index_in_namespace( - namespace=document_index_namespace, - filter_index_name=name, - field_name=field_name, - field_type=field_type, # type:ignore[arg-type], - ) - - yield name - finally: - document_index.delete_filter_index_from_namespace( - document_index_namespace, name - ) - - @fixture def random_collection( document_index: DocumentIndexClient, From ab7be5f8c27c2912a48438992a32635c7b8ec709 Mon Sep 17 00:00:00 2001 From: Michael Barlow <25936840+Michael-JB@users.noreply.github.com> Date: Thu, 5 Dec 2024 16:56:37 +0100 Subject: [PATCH 05/15] test: pull DI URL from environment in doctest --- src/documentation/document_index.ipynb | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git 
a/src/documentation/document_index.ipynb b/src/documentation/document_index.ipynb index 3549a7f18..8c4f1b97e 100644 --- a/src/documentation/document_index.ipynb +++ b/src/documentation/document_index.ipynb @@ -61,9 +61,7 @@ "source": [ "## Upload documents to the Document Index\n", "\n", - "To search through the DI, you'll first need to upload the documents to it.\n", - "For now, we'll use the [DI instance hosted by Aleph Alpha](https://app.document-index.aleph-alpha.com).\n", - "We assume you have an assigned namespace and possess a token to access it." + "To search through the DI, you'll first need to upload the documents to it. We assume that the URL of your DI instance is available under the `DOCUMENT_INDEX_URL` environment variable, and that you already have a namespace and a token to access it." ] }, { @@ -72,8 +70,8 @@ "metadata": {}, "outputs": [], "source": [ - "# specify this for your own namespace\n", - "NAMESPACE = \"aleph-alpha\"" + "# change this to your namespace\n", + "NAMESPACE = \"Search\"" ] }, { @@ -84,7 +82,7 @@ "source": [ "document_index = DocumentIndexClient(\n", " token=getenv(\"AA_TOKEN\"),\n", - " base_document_index_url=\"https://document-index.aleph-alpha.com\",\n", + " base_document_index_url=getenv(\"DOCUMENT_INDEX_URL\"),\n", ")" ] }, @@ -659,13 +657,6 @@ "source": [ "tracer" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From 9ee16321a4ca211ba828b7a6ceb708c7f5f6bc08 Mon Sep 17 00:00:00 2001 From: Michael Barlow <25936840+Michael-JB@users.noreply.github.com> Date: Thu, 5 Dec 2024 17:22:01 +0100 Subject: [PATCH 06/15] test: update DI random identifier --- tests/connectors/document_index/test_document_index.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/connectors/document_index/test_document_index.py b/tests/connectors/document_index/test_document_index.py index 5434d8633..ef7a4936d 100644 --- 
a/tests/connectors/document_index/test_document_index.py +++ b/tests/connectors/document_index/test_document_index.py @@ -83,13 +83,14 @@ def random_alphanumeric_string(length: int = 20) -> str: def random_identifier() -> str: name = random_alphanumeric_string(10) timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S") - return f"ci-il-{name}-{timestamp}" + return f"intelligence-layer-ci-{name}-{timestamp}" def is_outdated_identifier(identifier: str, timestamp_threshold: datetime) -> bool: # match the format that is defined in random_identifier() matched = re.match( - r"^ci-il-[a-zA-Z0-9]{20}-(?P<timestamp>\d{8}T\d{6})$", identifier + r"^intelligence-layer-ci-[a-zA-Z0-9]{10}-(?P<timestamp>\d{8}T\d{6})$", + identifier, ) if matched is None: return False From 33c9e698873d1368fca41825bc88acbebbda930b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20K=C3=B6hnecke?= Date: Tue, 10 Dec 2024 16:13:17 +0100 Subject: [PATCH 07/15] wip: migrate broken tests to new env variables --- tests/conftest.py | 448 +++++++++++++++++- .../document_index/test_document_index.py | 420 +--------------- .../test_document_index_retriever.py | 16 +- tests/examples/qa/test_retriever_based_qa.py | 13 - tests/examples/search/test_expand_chunk.py | 31 +- .../summarize/test_recursive_summarize.py | 9 +- 6 files changed, 466 insertions(+), 471 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 558e4605d..4ace913a9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,8 +1,15 @@ import os -from collections.abc import Sequence +import random +import re +import string +from collections.abc import Callable, Iterable, Iterator, Sequence +from contextlib import contextmanager +from datetime import datetime, timedelta, timezone +from functools import wraps from os import getenv from pathlib import Path -from typing import cast +from time import sleep +from typing import ParamSpec, TypeVar, cast, get_args, overload from aleph_alpha_client import Client, Image from dotenv import load_dotenv @@
-18,6 +25,20 @@ QdrantInMemoryRetriever, RetrieverType, ) +from intelligence_layer.connectors.base.json_serializable import JsonSerializable +from intelligence_layer.connectors.document_index.document_index import ( + CollectionPath, + DocumentContents, + DocumentPath, + EmbeddingConfig, + HybridIndex, + IndexConfiguration, + IndexPath, + InstructableEmbed, + Representation, + SearchQuery, + SemanticEmbed, +) from intelligence_layer.core import ( LuminousControlModel, NoOpTracer, @@ -44,14 +65,14 @@ def token() -> str: @fixture(scope="session") -def client(token: str) -> AlephAlphaClientProtocol: - """Provide fixture for api. +def inference_url() -> str: + return os.environ["CLIENT_URL"] + - Args: - token: AA Token - """ +@fixture(scope="session") +def client(token: str, inference_url: str) -> AlephAlphaClientProtocol: return LimitedConcurrencyClient( - Client(token, host=os.environ["CLIENT_URL"]), + Client(token, host=inference_url), max_concurrency=10, max_retry_time=10, ) @@ -111,15 +132,420 @@ def symmetric_in_memory_retriever( ) +# document index setup +P = ParamSpec("P") +R = TypeVar("R") + + +@overload +def retry( + func: None = None, max_retries: int = 3, seconds_delay: float = 0.0 +) -> Callable[[Callable[P, R]], Callable[P, R]]: ... + + +@overload +def retry( + func: Callable[P, R], max_retries: int = 3, seconds_delay: float = 0.0 +) -> Callable[P, R]: ... 
+ + +def retry( + func: Callable[P, R] | None = None, + max_retries: int = 60, + seconds_delay: float = 0.5, +) -> Callable[[Callable[P, R]], Callable[P, R]] | Callable[P, R]: + def decorator(func: Callable[P, R]) -> Callable[P, R]: + @wraps(func) + def wrapper(*args: P.args, **kwargs: P.kwargs) -> R: + for _ in range(1 + max_retries): + try: + return func(*args, **kwargs) + except Exception as e: + last_exception = e + sleep(seconds_delay) + + raise last_exception + + return wrapper + + if func is None: + return decorator + else: + return decorator(func) + + +def random_alphanumeric_string(length: int = 20) -> str: + return "".join(random.choices(string.ascii_letters + string.digits, k=length)) + + +def random_identifier() -> str: + name = random_alphanumeric_string(10) + timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S") + return f"intelligence-layer-ci-{name}-{timestamp}" + + +def is_outdated_identifier(identifier: str, timestamp_threshold: datetime) -> bool: + # match the format that is defined in random_identifier() + matched = re.match( + r"^intelligence-layer-ci-[a-zA-Z0-9]{10}-(?P<timestamp>\d{8}T\d{6})$", + identifier, + ) + if matched is None: + return False + + timestamp = datetime.strptime(matched["timestamp"], "%Y%m%dT%H%M%S").replace( + tzinfo=timezone.utc + ) + return not timestamp > timestamp_threshold + + +def random_semantic_embed() -> EmbeddingConfig: + return SemanticEmbed( + representation=random.choice(get_args(Representation)), + model_name="luminous-base", + ) + + +def random_instructable_embed() -> EmbeddingConfig: + return InstructableEmbed( + model_name="pharia-1-embedding-4608-control", + query_instruction=random_alphanumeric_string(), + document_instruction=random_alphanumeric_string(), + ) + + +def random_embedding_config() -> EmbeddingConfig: + return random.choice([random_semantic_embed(), random_instructable_embed()]) + + +@fixture +def document_contents() -> DocumentContents: + text = """John Stith Pemberton, the inventor of the 
world-renowned beverage Coca-Cola, was a figure whose life was marked by creativity, entrepreneurial spirit, and the turbulent backdrop of 19th-century America. Born on January 8, 1831, in Knoxville, Georgia, Pemberton grew up in an era of profound transformation and change. + +Pemberton began his professional journey by studying medicine and pharmacy. After earning a degree in pharmacy, he started his career as a druggist in Columbus, Georgia. He was known for his keen interest in creating medicinal concoctions and was well-respected in his community. His early creations included various medicines and tonics, which were typical of the times when pharmacists often concocted their own remedies. + +Pemberton's life took a significant turn during the American Civil War. He served as a lieutenant colonel in the Confederate Army, and it was during this period that he sustained a wound that led him to become dependent on morphine. This personal struggle with addiction likely influenced his later work in seeking out alternatives and remedies for pain relief. + +In the post-war years, Pemberton relocated to Atlanta, Georgia, where he continued to experiment with various medicinal syrups and tonics. It was during this time, in the late 19th century, that he developed a beverage he initially called "Pemberton's French Wine Coca." This concoction was inspired by Vin Mariani, a popular French tonic wine that contained coca leaves. Pemberton's beverage was intended to serve not just as a refreshing drink but also as a remedy for various ailments, including morphine addiction, indigestion, and headaches. + +However, in 1886, when Atlanta introduced prohibition legislation, Pemberton was compelled to create a non-alcoholic version of his beverage. He experimented with a combination of carbonated water, coca leaf extract, kola nut, and other ingredients, eventually perfecting the formula for what would soon become Coca-Cola. 
The name was suggested by his bookkeeper, Frank Robinson, who also created the distinctive cursive logo that is still in use today. + +Pemberton advertised his new creation as a "brain tonic" and "temperance drink," asserting that it could alleviate headaches and fatigue. However, due to his declining health and financial difficulties, Pemberton was eventually compelled to sell portions of his business to various partners. Shortly before his death in 1888, he sold his remaining stake in Coca-Cola to Asa G. Candler, a fellow pharmacist and businessman. + +Under Candler's leadership, Coca-Cola transformed from a pharmacist's concoction into a mass-produced and marketed beverage that became a staple of American culture and a global icon. Despite the changes and the immense growth of the brand, the legacy of John Stith Pemberton as the inventor of Coca-Cola remains an integral part of the beverage's history. + +Pemberton's life story is a testament to the spirit of innovation and resilience. His creation, borne out of personal struggles and the context of his times, went on to transcend its origins and become a symbol recognized across the globe. Today, when we think of Coca-Cola, we are reminded of Pemberton's journey from a small-town pharmacist to the creator of one of the world's most enduring and beloved brands.""" + return DocumentContents(contents=[text], metadata={"Some": "Metadata"}) + + +@fixture(scope="session") +def document_contents_with_metadata() -> list[DocumentContents]: + text_1 = """John Stith Pemberton, the inventor of the world-renowned beverage Coca-Cola, was a figure whose life was marked by creativity, entrepreneurial spirit, and the turbulent backdrop of 19th-century America. Born on January 8, 1831, in Knoxville, Georgia, Pemberton grew up in an era of profound transformation and change.""" + text_2 = """Pemberton began his professional journey by studying medicine and pharmacy. 
After earning a degree in pharmacy, he started his career as a druggist in Columbus, Georgia. He was known for his keen interest in creating medicinal concoctions and was well-respected in his community. His early creations included various medicines and tonics, which were typical of the times when pharmacists often concocted their own remedies.""" + text_3 = """Pemberton's life took a significant turn during the American Civil War. He served as a lieutenant colonel in the Confederate Army, and it was during this period that he sustained a wound that led him to become dependent on morphine. This personal struggle with addiction likely influenced his later work in seeking out alternatives and remedies for pain relief.""" + + metadata_1: JsonSerializable = { + "string-field": "example_string_1", + "integer-field": 123, + "float-field": 123.45, + "boolean-field": True, + "date-field": datetime(2022, 1, 1, tzinfo=timezone.utc) + .isoformat(timespec="seconds") + .replace("+00:00", "Z"), + } + + metadata_2: JsonSerializable = { + "string-field": "example_string_2", + "integer-field": 456, + "float-field": 678.90, + "boolean-field": False, + "date-field": datetime(2023, 1, 1, tzinfo=timezone.utc) + .isoformat(timespec="seconds") + .replace("+00:00", "Z"), + } + + metadata_3: JsonSerializable = { + "string-field": "example_string_3", + "integer-field": 789, + "float-field": 101112.13, + "boolean-field": True, + "date-field": datetime(2024, 1, 1, tzinfo=timezone.utc) + .isoformat(timespec="seconds") + .replace("+00:00", "Z"), + } + + return [ + DocumentContents(contents=[text_1], metadata=metadata_1), + DocumentContents(contents=[text_2], metadata=metadata_2), + DocumentContents(contents=[text_3], metadata=metadata_3), + ] + + +@fixture(scope="session") +def document_index_namespace(document_index: DocumentIndexClient) -> Iterable[str]: + yield "Search" + _teardown(document_index, "Search") + + +def _teardown( + document_index: DocumentIndexClient, document_index_namespace: 
str +) -> Iterator[None]: + yield + + # Cleanup leftover resources from previous runs. + timestamp_threshold = datetime.now(timezone.utc) - timedelta(hours=1) + + collections = document_index.list_collections(document_index_namespace) + for collection_path in collections: + if is_outdated_identifier(collection_path.collection, timestamp_threshold): + document_index.delete_collection(collection_path) + + indexes = document_index.list_indexes(document_index_namespace) + for index_path in indexes: + if is_outdated_identifier(index_path.index, timestamp_threshold): + document_index.delete_index(index_path) + + filter_indexes = document_index.list_filter_indexes_in_namespace( + document_index_namespace + ) + for filter_index in filter_indexes: + if is_outdated_identifier(filter_index, timestamp_threshold): + document_index.delete_filter_index_from_namespace( + document_index_namespace, filter_index + ) + + +@fixture(scope="session") +def filter_index_configs() -> dict[str, dict[str, str]]: + return { + random_identifier(): { + "field-name": "string-field", + "field-type": "string", + }, + random_identifier(): { + "field-name": "integer-field", + "field-type": "integer", + }, + random_identifier(): { + "field-name": "float-field", + "field-type": "float", + }, + random_identifier(): { + "field-name": "boolean-field", + "field-type": "boolean", + }, + random_identifier(): { + "field-name": "date-field", + "field-type": "date_time", + }, + } + + +@contextmanager +def random_index_with_embedding_config( + document_index: DocumentIndexClient, + document_index_namespace: str, + embedding_config: EmbeddingConfig, +) -> Iterator[tuple[IndexPath, IndexConfiguration]]: + name = random_identifier() + + chunk_size, chunk_overlap = sorted( + random.sample([0, 32, 64, 128, 256, 512, 1024], 2), reverse=True + ) + + hybrid_index_choices: list[HybridIndex] = ["bm25", None] + hybrid_index = random.choice(hybrid_index_choices) + + index = IndexPath(namespace=document_index_namespace, 
index=name) + index_configuration = IndexConfiguration( + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + hybrid_index=hybrid_index, + embedding=embedding_config, + ) + try: + document_index.create_index(index, index_configuration) + yield index, index_configuration + finally: + document_index.delete_index(index) + + +@fixture +def random_instructable_index( + document_index: DocumentIndexClient, document_index_namespace: str +) -> Iterator[tuple[IndexPath, IndexConfiguration]]: + with random_index_with_embedding_config( + document_index, document_index_namespace, random_instructable_embed() + ) as index: + yield index + + +@fixture +def random_semantic_index( + document_index: DocumentIndexClient, document_index_namespace: str +) -> Iterator[tuple[IndexPath, IndexConfiguration]]: + with random_index_with_embedding_config( + document_index, document_index_namespace, random_semantic_embed() + ) as index: + yield index + + +@fixture +def random_index( + document_index: DocumentIndexClient, document_index_namespace: str +) -> Iterator[tuple[IndexPath, IndexConfiguration]]: + with random_index_with_embedding_config( + document_index, + document_index_namespace, + random.choice([random_semantic_embed(), random_instructable_embed()]), + ) as index: + yield index + + +@fixture +def random_collection( + document_index: DocumentIndexClient, + document_index_namespace: str, +) -> Iterator[CollectionPath]: + collection_name = random_identifier() + collection_path = CollectionPath( + namespace=document_index_namespace, collection=collection_name + ) + try: + document_index.create_collection(collection_path) + + yield collection_path + finally: + document_index.delete_collection(collection_path) + + +@fixture(scope="session") +def read_only_populated_collection( + document_index: DocumentIndexClient, + document_index_namespace: str, + document_contents_with_metadata: list[DocumentContents], + filter_index_configs: dict[str, dict[str, str]], +) -> 
Iterator[tuple[CollectionPath, IndexPath]]: + index_name = random_identifier() + index_path = IndexPath(namespace=document_index_namespace, index=index_name) + index_configuration = IndexConfiguration( + chunk_size=512, + chunk_overlap=0, + hybrid_index="bm25", + embedding=SemanticEmbed( + representation="asymmetric", + model_name="luminous-base", + ), + ) + + collection_name = random_identifier() + collection_path = CollectionPath( + namespace=document_index_namespace, collection=collection_name + ) + + try: + document_index.create_collection(collection_path) + document_index.create_index(index_path, index_configuration) + document_index.assign_index_to_collection(collection_path, index_name) + + for name, config in filter_index_configs.items(): + document_index.create_filter_index_in_namespace( + namespace=document_index_namespace, + filter_index_name=name, + field_name=config["field-name"], + field_type=config["field-type"], # type:ignore[arg-type] + ) + document_index.assign_filter_index_to_search_index( + collection_path=collection_path, + index_name=index_name, + filter_index_name=name, + ) + + for i, content in enumerate(document_contents_with_metadata): + document_index.add_document( + DocumentPath( + collection_path=collection_path, + document_name=f"document-{i}", + ), + content, + ) + + yield collection_path, index_path + finally: + document_index.delete_collection(collection_path) + + @retry + def clean_up_indexes() -> None: + document_index.delete_index(index_path) + for filter_index_name in filter_index_configs: + document_index.delete_filter_index_from_namespace( + document_index_namespace, filter_index_name + ) + + clean_up_indexes() + + +@fixture +def random_searchable_collection( + document_index: DocumentIndexClient, + document_contents_with_metadata: list[DocumentContents], + random_index: tuple[IndexPath, IndexConfiguration], + random_collection: CollectionPath, +) -> Iterator[tuple[CollectionPath, IndexPath]]: + index_path, _ = random_index + 
index_name = index_path.index + collection_path = random_collection + + try: + # Assign index + document_index.assign_index_to_collection(collection_path, index_name) + + # Add 3 documents + for i, content in enumerate(document_contents_with_metadata): + document_index.add_document( + DocumentPath( + collection_path=collection_path, + document_name=f"document-{i}", + ), + content, + ) + + # Ensure documents are searchable; this allows time for indexing + @retry + def search() -> None: + search_result = document_index.search( + collection_path, + index_name, + SearchQuery( + query="Coca-Cola", + ), + ) + assert len(search_result) > 0 + + search() + + yield collection_path, index_path + finally: + document_index.delete_collection(collection_path) + + @retry + def clean_up_index() -> None: + document_index.delete_index(index_path) + + clean_up_index() + + +# end document index setup @fixture def document_index_retriever( + random_searchable_collection: tuple[CollectionPath, IndexPath], document_index: DocumentIndexClient, ) -> DocumentIndexRetriever: return DocumentIndexRetriever( document_index, - index_name="asymmetric", - namespace="aleph-alpha", - collection="wikipedia-de", + index_name=random_searchable_collection[1].index, + namespace=random_searchable_collection[0].namespace, + collection=random_searchable_collection[0].collection, k=2, ) diff --git a/tests/connectors/document_index/test_document_index.py b/tests/connectors/document_index/test_document_index.py index ef7a4936d..db4a168ef 100644 --- a/tests/connectors/document_index/test_document_index.py +++ b/tests/connectors/document_index/test_document_index.py @@ -1,440 +1,26 @@ -import random -import re -import string -from collections.abc import Callable, Iterator -from contextlib import contextmanager -from datetime import datetime, timedelta, timezone -from functools import wraps +from datetime import datetime, timezone from http import HTTPStatus -from time import sleep -from typing import ParamSpec, 
TypeVar, get_args, overload import pytest from pydantic import ValidationError -from pytest import fixture, raises +from pytest import raises -from intelligence_layer.connectors.base.json_serializable import JsonSerializable from intelligence_layer.connectors.document_index.document_index import ( CollectionPath, DocumentContents, DocumentFilterQueryParams, DocumentIndexClient, DocumentPath, - EmbeddingConfig, FilterField, FilterOps, Filters, - HybridIndex, IndexConfiguration, IndexPath, - InstructableEmbed, InvalidInput, - Representation, ResourceNotFound, SearchQuery, - SemanticEmbed, ) - -P = ParamSpec("P") -R = TypeVar("R") - - -@overload -def retry( - func: None = None, max_retries: int = 3, seconds_delay: float = 0.0 -) -> Callable[[Callable[P, R]], Callable[P, R]]: ... - - -@overload -def retry( - func: Callable[P, R], max_retries: int = 3, seconds_delay: float = 0.0 -) -> Callable[P, R]: ... - - -def retry( - func: Callable[P, R] | None = None, - max_retries: int = 60, - seconds_delay: float = 0.5, -) -> Callable[[Callable[P, R]], Callable[P, R]] | Callable[P, R]: - def decorator(func: Callable[P, R]) -> Callable[P, R]: - @wraps(func) - def wrapper(*args: P.args, **kwargs: P.kwargs) -> R: - for _ in range(1 + max_retries): - try: - return func(*args, **kwargs) - except Exception as e: - last_exception = e - sleep(seconds_delay) - - raise last_exception - - return wrapper - - if func is None: - return decorator - else: - return decorator(func) - - -def random_alphanumeric_string(length: int = 20) -> str: - return "".join(random.choices(string.ascii_letters + string.digits, k=length)) - - -def random_identifier() -> str: - name = random_alphanumeric_string(10) - timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S") - return f"intelligence-layer-ci-{name}-{timestamp}" - - -def is_outdated_identifier(identifier: str, timestamp_threshold: datetime) -> bool: - # match the format that is defined in random_identifier() - matched = re.match( - 
r"^intelligence-layer-ci-[a-zA-Z0-9]{10}-(?P\d{8}T\d{6})$", - identifier, - ) - if matched is None: - return False - - timestamp = datetime.strptime(matched["timestamp"], "%Y%m%dT%H%M%S").replace( - tzinfo=timezone.utc - ) - return not timestamp > timestamp_threshold - - -def random_semantic_embed() -> EmbeddingConfig: - return SemanticEmbed( - representation=random.choice(get_args(Representation)), - model_name="luminous-base", - ) - - -def random_instructable_embed() -> EmbeddingConfig: - return InstructableEmbed( - model_name="pharia-1-embedding-4608-control", - query_instruction=random_alphanumeric_string(), - document_instruction=random_alphanumeric_string(), - ) - - -def random_embedding_config() -> EmbeddingConfig: - return random.choice([random_semantic_embed(), random_instructable_embed()]) - - -@fixture -def document_contents() -> DocumentContents: - text = """John Stith Pemberton, the inventor of the world-renowned beverage Coca-Cola, was a figure whose life was marked by creativity, entrepreneurial spirit, and the turbulent backdrop of 19th-century America. Born on January 8, 1831, in Knoxville, Georgia, Pemberton grew up in an era of profound transformation and change. - -Pemberton began his professional journey by studying medicine and pharmacy. After earning a degree in pharmacy, he started his career as a druggist in Columbus, Georgia. He was known for his keen interest in creating medicinal concoctions and was well-respected in his community. His early creations included various medicines and tonics, which were typical of the times when pharmacists often concocted their own remedies. - -Pemberton's life took a significant turn during the American Civil War. He served as a lieutenant colonel in the Confederate Army, and it was during this period that he sustained a wound that led him to become dependent on morphine. This personal struggle with addiction likely influenced his later work in seeking out alternatives and remedies for pain relief. 
- -In the post-war years, Pemberton relocated to Atlanta, Georgia, where he continued to experiment with various medicinal syrups and tonics. It was during this time, in the late 19th century, that he developed a beverage he initially called "Pemberton's French Wine Coca." This concoction was inspired by Vin Mariani, a popular French tonic wine that contained coca leaves. Pemberton's beverage was intended to serve not just as a refreshing drink but also as a remedy for various ailments, including morphine addiction, indigestion, and headaches. - -However, in 1886, when Atlanta introduced prohibition legislation, Pemberton was compelled to create a non-alcoholic version of his beverage. He experimented with a combination of carbonated water, coca leaf extract, kola nut, and other ingredients, eventually perfecting the formula for what would soon become Coca-Cola. The name was suggested by his bookkeeper, Frank Robinson, who also created the distinctive cursive logo that is still in use today. - -Pemberton advertised his new creation as a "brain tonic" and "temperance drink," asserting that it could alleviate headaches and fatigue. However, due to his declining health and financial difficulties, Pemberton was eventually compelled to sell portions of his business to various partners. Shortly before his death in 1888, he sold his remaining stake in Coca-Cola to Asa G. Candler, a fellow pharmacist and businessman. - -Under Candler's leadership, Coca-Cola transformed from a pharmacist's concoction into a mass-produced and marketed beverage that became a staple of American culture and a global icon. Despite the changes and the immense growth of the brand, the legacy of John Stith Pemberton as the inventor of Coca-Cola remains an integral part of the beverage's history. - -Pemberton's life story is a testament to the spirit of innovation and resilience. 
His creation, borne out of personal struggles and the context of his times, went on to transcend its origins and become a symbol recognized across the globe. Today, when we think of Coca-Cola, we are reminded of Pemberton's journey from a small-town pharmacist to the creator of one of the world's most enduring and beloved brands.""" - return DocumentContents(contents=[text], metadata={"Some": "Metadata"}) - - -@fixture(scope="session") -def document_contents_with_metadata() -> list[DocumentContents]: - text_1 = """John Stith Pemberton, the inventor of the world-renowned beverage Coca-Cola, was a figure whose life was marked by creativity, entrepreneurial spirit, and the turbulent backdrop of 19th-century America. Born on January 8, 1831, in Knoxville, Georgia, Pemberton grew up in an era of profound transformation and change.""" - text_2 = """Pemberton began his professional journey by studying medicine and pharmacy. After earning a degree in pharmacy, he started his career as a druggist in Columbus, Georgia. He was known for his keen interest in creating medicinal concoctions and was well-respected in his community. His early creations included various medicines and tonics, which were typical of the times when pharmacists often concocted their own remedies.""" - text_3 = """Pemberton's life took a significant turn during the American Civil War. He served as a lieutenant colonel in the Confederate Army, and it was during this period that he sustained a wound that led him to become dependent on morphine. 
This personal struggle with addiction likely influenced his later work in seeking out alternatives and remedies for pain relief.""" - - metadata_1: JsonSerializable = { - "string-field": "example_string_1", - "integer-field": 123, - "float-field": 123.45, - "boolean-field": True, - "date-field": datetime(2022, 1, 1, tzinfo=timezone.utc) - .isoformat(timespec="seconds") - .replace("+00:00", "Z"), - } - - metadata_2: JsonSerializable = { - "string-field": "example_string_2", - "integer-field": 456, - "float-field": 678.90, - "boolean-field": False, - "date-field": datetime(2023, 1, 1, tzinfo=timezone.utc) - .isoformat(timespec="seconds") - .replace("+00:00", "Z"), - } - - metadata_3: JsonSerializable = { - "string-field": "example_string_3", - "integer-field": 789, - "float-field": 101112.13, - "boolean-field": True, - "date-field": datetime(2024, 1, 1, tzinfo=timezone.utc) - .isoformat(timespec="seconds") - .replace("+00:00", "Z"), - } - - return [ - DocumentContents(contents=[text_1], metadata=metadata_1), - DocumentContents(contents=[text_2], metadata=metadata_2), - DocumentContents(contents=[text_3], metadata=metadata_3), - ] - - -@fixture(scope="session") -def document_index_namespace() -> str: - return "Search" - - -@fixture(scope="session", autouse=True) -def _teardown( - document_index: DocumentIndexClient, document_index_namespace: str -) -> Iterator[None]: - yield - - # Cleanup leftover resources from previous runs. 
- timestamp_threshold = datetime.now(timezone.utc) - timedelta(hours=1) - - collections = document_index.list_collections(document_index_namespace) - for collection_path in collections: - if is_outdated_identifier(collection_path.collection, timestamp_threshold): - document_index.delete_collection(collection_path) - - indexes = document_index.list_indexes(document_index_namespace) - for index_path in indexes: - if is_outdated_identifier(index_path.index, timestamp_threshold): - document_index.delete_index(index_path) - - filter_indexes = document_index.list_filter_indexes_in_namespace( - document_index_namespace - ) - for filter_index in filter_indexes: - if is_outdated_identifier(filter_index, timestamp_threshold): - document_index.delete_filter_index_from_namespace( - document_index_namespace, filter_index - ) - - -@fixture(scope="session") -def filter_index_configs() -> dict[str, dict[str, str]]: - return { - random_identifier(): { - "field-name": "string-field", - "field-type": "string", - }, - random_identifier(): { - "field-name": "integer-field", - "field-type": "integer", - }, - random_identifier(): { - "field-name": "float-field", - "field-type": "float", - }, - random_identifier(): { - "field-name": "boolean-field", - "field-type": "boolean", - }, - random_identifier(): { - "field-name": "date-field", - "field-type": "date_time", - }, - } - - -@contextmanager -def random_index_with_embedding_config( - document_index: DocumentIndexClient, - document_index_namespace: str, - embedding_config: EmbeddingConfig, -) -> Iterator[tuple[IndexPath, IndexConfiguration]]: - name = random_identifier() - - chunk_size, chunk_overlap = sorted( - random.sample([0, 32, 64, 128, 256, 512, 1024], 2), reverse=True - ) - - hybrid_index_choices: list[HybridIndex] = ["bm25", None] - hybrid_index = random.choice(hybrid_index_choices) - - index = IndexPath(namespace=document_index_namespace, index=name) - index_configuration = IndexConfiguration( - chunk_size=chunk_size, - 
chunk_overlap=chunk_overlap, - hybrid_index=hybrid_index, - embedding=embedding_config, - ) - try: - document_index.create_index(index, index_configuration) - yield index, index_configuration - finally: - document_index.delete_index(index) - - -@fixture -def random_instructable_index( - document_index: DocumentIndexClient, document_index_namespace: str -) -> Iterator[tuple[IndexPath, IndexConfiguration]]: - with random_index_with_embedding_config( - document_index, document_index_namespace, random_instructable_embed() - ) as index: - yield index - - -@fixture -def random_semantic_index( - document_index: DocumentIndexClient, document_index_namespace: str -) -> Iterator[tuple[IndexPath, IndexConfiguration]]: - with random_index_with_embedding_config( - document_index, document_index_namespace, random_semantic_embed() - ) as index: - yield index - - -@fixture -def random_index( - document_index: DocumentIndexClient, document_index_namespace: str -) -> Iterator[tuple[IndexPath, IndexConfiguration]]: - with random_index_with_embedding_config( - document_index, - document_index_namespace, - random.choice([random_semantic_embed(), random_instructable_embed()]), - ) as index: - yield index - - -@fixture -def random_collection( - document_index: DocumentIndexClient, - document_index_namespace: str, -) -> Iterator[CollectionPath]: - collection_name = random_identifier() - collection_path = CollectionPath( - namespace=document_index_namespace, collection=collection_name - ) - try: - document_index.create_collection(collection_path) - - yield collection_path - finally: - document_index.delete_collection(collection_path) - - -@fixture(scope="session") -def read_only_populated_collection( - document_index: DocumentIndexClient, - document_index_namespace: str, - document_contents_with_metadata: list[DocumentContents], - filter_index_configs: dict[str, dict[str, str]], -) -> Iterator[tuple[CollectionPath, IndexPath]]: - index_name = random_identifier() - index_path = 
IndexPath(namespace=document_index_namespace, index=index_name) - index_configuration = IndexConfiguration( - chunk_size=512, - chunk_overlap=0, - hybrid_index="bm25", - embedding=SemanticEmbed( - representation="asymmetric", - model_name="luminous-base", - ), - ) - - collection_name = random_identifier() - collection_path = CollectionPath( - namespace=document_index_namespace, collection=collection_name - ) - - try: - document_index.create_collection(collection_path) - document_index.create_index(index_path, index_configuration) - document_index.assign_index_to_collection(collection_path, index_name) - - for name, config in filter_index_configs.items(): - document_index.create_filter_index_in_namespace( - namespace=document_index_namespace, - filter_index_name=name, - field_name=config["field-name"], - field_type=config["field-type"], # type:ignore[arg-type] - ) - document_index.assign_filter_index_to_search_index( - collection_path=collection_path, - index_name=index_name, - filter_index_name=name, - ) - - for i, content in enumerate(document_contents_with_metadata): - document_index.add_document( - DocumentPath( - collection_path=collection_path, - document_name=f"document-{i}", - ), - content, - ) - - yield collection_path, index_path - finally: - document_index.delete_collection(collection_path) - - @retry - def clean_up_indexes() -> None: - document_index.delete_index(index_path) - for filter_index_name in filter_index_configs: - document_index.delete_filter_index_from_namespace( - document_index_namespace, filter_index_name - ) - - clean_up_indexes() - - -@fixture -def random_searchable_collection( - document_index: DocumentIndexClient, - document_contents_with_metadata: list[DocumentContents], - random_index: tuple[IndexPath, IndexConfiguration], - random_collection: CollectionPath, -) -> Iterator[tuple[CollectionPath, IndexPath]]: - index_path, _ = random_index - index_name = index_path.index - collection_path = random_collection - - try: - # Assign index 
- document_index.assign_index_to_collection(collection_path, index_name) - - # Add 3 documents - for i, content in enumerate(document_contents_with_metadata): - document_index.add_document( - DocumentPath( - collection_path=collection_path, - document_name=f"document-{i}", - ), - content, - ) - - # Ensure documents are searchable; this allows time for indexing - @retry - def search() -> None: - search_result = document_index.search( - collection_path, - index_name, - SearchQuery( - query="Coca-Cola", - ), - ) - assert len(search_result) > 0 - - search() - - yield collection_path, index_path - finally: - document_index.delete_collection(collection_path) - - @retry - def clean_up_index() -> None: - document_index.delete_index(index_path) - - clean_up_index() +from tests.conftest import random_embedding_config, retry @pytest.mark.internal diff --git a/tests/connectors/retrievers/test_document_index_retriever.py b/tests/connectors/retrievers/test_document_index_retriever.py index c9f6ca10c..575d0424b 100644 --- a/tests/connectors/retrievers/test_document_index_retriever.py +++ b/tests/connectors/retrievers/test_document_index_retriever.py @@ -4,20 +4,12 @@ DocumentIndexRetriever, ) -QUERY = "Who likes pizza?" -TEXTS = [ - "Gegenwart \nDurch italienische Auswanderer verbreitete sich die Pizza gegen Ende des 19. Jahrhunderts auch in den USA. Im Oktober 1937 wurde in Frankfurt am Main erstmals eine Pizza auf dem damaligen Festhallengelände im Rahmen der 7. Internationalen Kochkunst-Ausstellung bei der Messe Frankfurt zubereitet. Nach dem Zweiten Weltkrieg wurde Pizza auch in Europa außerhalb Italiens bekannter. Die erste Pizzeria in Deutschland wurde von Nicolino di Camillo (1921–2015) im März 1952 in Würzburg unter dem Namen Sabbie di Capri eröffnet. Von hier aus begann der Siegeszug der Pizza in Deutschland. Die erste Pizzeria in Wien wurde 1975 von Pasquale Tavella eröffnet. 
Neben Spaghetti ist die Pizza heute das bekannteste italienische Nationalgericht, sie wird weltweit angeboten.\n\nZubereitung \nZur Zubereitung wird zuerst ein einfacher Hefeteig aus Mehl, Wasser, wenig Hefe, Salz und eventuell etwas Olivenöl hergestellt, gründlich durchgeknetet und nach einer Gehzeit von mindestens einer Stunde bei Zimmertemperatur (bzw. über Nacht im oberen Fach des Kühlschranks) ausgerollt oder mit den bemehlten Händen dünn ausgezogen. Geübte Pizzabäcker ziehen den Teig über den Handrücken und weiten ihn durch Kreisenlassen in der Luft.\n\nDann wird der Teig mit den Zutaten je nach Rezept nicht zu üppig belegt, meist mit passierten Dosentomaten oder Salsa pizzaiola (einer vorher gekochten, sämigen Tomatensauce, die mit Oregano, Basilikum, Knoblauch und anderem kräftig gewürzt ist). Es folgen der Käse (z. B. Mozzarella, Parmesan oder Pecorino) und die übrigen Zutaten, zum Abschluss etwas Olivenöl.\n\nSchließlich wird die Pizza bei einer möglichst hohen Temperatur von 400 bis 500 °C für wenige Minuten kurz gebacken. Dies geschieht in einer möglichst niedrigen Kammer. Ein Stapeln in Einschüben oder separat schaltbare Unter- und Oberhitze ist daher nicht üblich. Der traditionelle Kuppelofen ist gemauert und die Hitze wird über ein Feuer direkt im Backraum erzeugt. Moderne Pizzaöfen werden mit Gas oder Strom beheizt.", - "Verbreitet in Italien ist auch die Pizza bianca (weiße Pizza), jegliche Pizza-Variation, die ohne Tomatensoße zubereitet wird.\n\nEine Calzone (italienisch für „Hose“) ist eine Pizza, bei welcher der Teigfladen vor dem Backen über dem Belag zusammengeklappt wird. Die traditionelle Füllung besteht aus Ricotta, rohem Schinken, Pilzen, Mozzarella, Parmesan und Oregano. 
Ursprünglich wurde die Calzone nicht im Ofen, sondern in einer Pfanne in Schmalz oder Öl gebacken, wie es als Pizza fritta in Neapel üblich ist.\n\nIn ganz Italien verbreitet ist die Pizza al taglio („Pizza am Stück“), die auf einem rechteckigen Blech gebacken und in kleineren rechteckigen Stücken verkauft wird. Angeboten wird sie häufig nicht nur in Pizzerien, sondern auch beim Bäcker.\n\nEine neuartige Abwandlung der Pizza ist die Pinsa, die rechteckig und aus einem lockeren Teig gebacken wird.\n\nUS-amerikanische Pizza \nIn den USA sind zwei Typen weit verbreitet, „Chicago-style“ und „New York-style“ Pizza. Während die New Yorker Variante mit ihrem sehr dünnen Boden der italienischen Variante ähnelt, steht die Variante aus Chicago Kopf: Der Teig bildet eine Schüsselform, wird mit Mozzarellascheiben ausgelegt und mit weiteren Zutaten gefüllt. Zum Schluss wird das ganze von oben mit zerkleinerten Tomaten bestrichen und mit Parmesan und Oregano bestreut.\n\nAuch die Pizza Hawaii mit Kochschinken und Ananas ist wahrscheinlich nordamerikanischen Ursprungs.\n\nIn Deutschland ist eine weitere Variante als „American Pizza“ populär, die sich vor allem durch einen dicken, luftigen Boden auszeichnet und u. a. durch die Restaurantkette Pizza Hut bekannt ist.\n\nKoschere Pizza", -] - @pytest.mark.internal def test_document_index_retriever( document_index_retriever: DocumentIndexRetriever, ) -> None: - documents = document_index_retriever.get_relevant_documents_with_scores(QUERY) - assert documents[0].document_chunk.text[0:30] in TEXTS[0] - assert documents[1].document_chunk.text[0:30] in TEXTS[1] - document_path = documents[0].id - assert document_path.collection_path == document_index_retriever._collection_path - assert document_path.document_name == "Pizza" + documents = document_index_retriever.get_relevant_documents_with_scores( + "Who took part in the war?" 
+ ) + assert len(documents) == 2 diff --git a/tests/examples/qa/test_retriever_based_qa.py b/tests/examples/qa/test_retriever_based_qa.py index a67633e1a..ad8864161 100644 --- a/tests/examples/qa/test_retriever_based_qa.py +++ b/tests/examples/qa/test_retriever_based_qa.py @@ -46,16 +46,3 @@ def test_retriever_based_qa_using_in_memory_retriever( assert output.answer assert "1888" in output.answer assert output.subanswers[0].id == 3 - - -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_retriever_based_qa_with_document_index( - retriever_based_qa_with_document_index: RetrieverBasedQa[DocumentPath], - no_op_tracer: NoOpTracer, -) -> None: - question = "When was Robert Moses born?" - input = RetrieverBasedQaInput(question=question) - output = retriever_based_qa_with_document_index.run(input, no_op_tracer) - assert output.answer - assert "1888" in output.answer - assert output.subanswers[0].id.document_name == "Robert Moses (Begriffsklärung)" diff --git a/tests/examples/search/test_expand_chunk.py b/tests/examples/search/test_expand_chunk.py index 2840bcef8..56b5ddbfd 100644 --- a/tests/examples/search/test_expand_chunk.py +++ b/tests/examples/search/test_expand_chunk.py @@ -8,11 +8,15 @@ BaseRetriever, Document, DocumentChunk, - DocumentIndexRetriever, - DocumentPath, QdrantInMemoryRetriever, SearchResult, ) +from intelligence_layer.connectors.limited_concurrency_client import ( + AlephAlphaClientProtocol, +) +from intelligence_layer.connectors.retrievers.qdrant_in_memory_retriever import ( + RetrieverType, +) from intelligence_layer.core import LuminousControlModel, NoOpTracer from intelligence_layer.examples import ExpandChunks, ExpandChunksInput @@ -177,26 +181,27 @@ def test_expand_chunk_works_for_multiple_chunks( def test_expand_chunk_is_fast_with_large_document( - document_index_retriever: DocumentIndexRetriever, + client: AlephAlphaClientProtocol, luminous_control_model: LuminousControlModel, no_op_tracer: NoOpTracer, ) -> None: + retriever = 
QdrantInMemoryRetriever( + [Document(text="""test text\n""" * 100)], + client=client, + k=2, + retriever_type=RetrieverType.ASYMMETRIC, + ) expand_chunk_input = ExpandChunksInput( - document_id=DocumentPath( - collection_path=document_index_retriever._collection_path, - document_name="Chronik der COVID-19-Pandemie in den Vereinigten Staaten 2020", - ), + document_id=0, chunks_found=[ DocumentChunk( - text="", - start=0, - end=50, + text="test text\n" * 10, + start=50, + end=60, ) ], ) - expand_chunk_task = ExpandChunks( - document_index_retriever, luminous_control_model, 256 - ) + expand_chunk_task = ExpandChunks(retriever, luminous_control_model, 256) time = datetime.now() output = expand_chunk_task.run(expand_chunk_input, no_op_tracer) diff --git a/tests/examples/summarize/test_recursive_summarize.py b/tests/examples/summarize/test_recursive_summarize.py index 7c92b30f9..fc56dfe2a 100644 --- a/tests/examples/summarize/test_recursive_summarize.py +++ b/tests/examples/summarize/test_recursive_summarize.py @@ -1,4 +1,3 @@ -import os from pathlib import Path from aleph_alpha_client import Client, CompletionRequest, CompletionResponse @@ -29,10 +28,10 @@ def complete(self, request: CompletionRequest, model: str) -> CompletionResponse @fixture -def recursive_counting_client() -> RecursiveCountingClient: - aa_token = os.getenv("AA_TOKEN") - assert aa_token - return RecursiveCountingClient(aa_token) +def recursive_counting_client( + token: str, inference_url: str +) -> RecursiveCountingClient: + return RecursiveCountingClient(token, host=inference_url) @fixture From f1d6cf60dfa68a4d84008b7a4963149c79361b24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20K=C3=B6hnecke?= Date: Thu, 12 Dec 2024 09:08:09 +0100 Subject: [PATCH 08/15] fix: notebooks and env in ci --- .env.example | 1 + .github/workflows/sdk-tests.yml | 8 ++--- docker-compose.yaml | 3 +- src/documentation/document_index.ipynb | 10 +++--- src/documentation/elo_qa_eval.ipynb | 13 +++----- 
src/documentation/evaluate_with_studio.ipynb | 18 ++++------- .../how_tos/how_to_implement_a_task.ipynb | 32 ++++++++++--------- .../studio/how_to_execute_a_benchmark.ipynb | 2 +- src/documentation/qa.ipynb | 8 ++--- .../connectors/limited_concurrency_client.py | 6 ++-- 10 files changed, 49 insertions(+), 52 deletions(-) diff --git a/.env.example b/.env.example index a7f982c22..fb3ccf8dc 100644 --- a/.env.example +++ b/.env.example @@ -14,6 +14,7 @@ POSTGRES_PASSWORD=test # things to adapt CLIENT_URL=... +AUTHORIZATION_SERVICE_URL=... AA_TOKEN=token DATA_SERVICE_URL=... DOCUMENT_INDEX_URL=... diff --git a/.github/workflows/sdk-tests.yml b/.github/workflows/sdk-tests.yml index b0358e6eb..7f0a44c04 100644 --- a/.github/workflows/sdk-tests.yml +++ b/.github/workflows/sdk-tests.yml @@ -147,9 +147,9 @@ jobs: POSTGRES_DB: "il_sdk" POSTGRES_USER: "il_sdk" POSTGRES_PASSWORD: "test" - AUTHORIZATION_SERVICE_URL: "none" + AUTHORIZATION_SERVICE_URL: ${{ secrets.AUTHORIZATION_SERVICE_URL }} AA_TOKEN: ${{ secrets.AA_TOKEN }} - API_SCHEDULER_URL: "https://api.aleph-alpha.com" + API_SCHEDULER_URL: ${{ secrets.CLIENT_URL }} DATA_SERVICE_URL: ${{secrets.DATA_SERVICE_URL}} credentials: username: "unused" @@ -235,9 +235,9 @@ jobs: POSTGRES_DB: "il_sdk" POSTGRES_USER: "il_sdk" POSTGRES_PASSWORD: "test" - AUTHORIZATION_SERVICE_URL: "none" + AUTHORIZATION_SERVICE_URL: ${{ secrets.AUTHORIZATION_SERVICE_URL }} AA_TOKEN: ${{ secrets.AA_TOKEN }} - API_SCHEDULER_URL: "https://api.aleph-alpha.com" + API_SCHEDULER_URL: ${{ secrets.CLIENT_URL }} DATA_SERVICE_URL: ${{secrets.DATA_SERVICE_URL}} credentials: username: "unused" diff --git a/docker-compose.yaml b/docker-compose.yaml index 161089f9a..67ac0f24a 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -71,8 +71,7 @@ services: env_file: ".env" # mainly for AA-TOKEN, DB User/PW environment: POSTGRES_HOST: postgres - AUTHORIZATION_SERVICE_URL: "none" - API_SCHEDULER_URL: "https://api.aleph-alpha.com" + API_SCHEDULER_URL: ${CLIENT_URL} 
postgres: image: postgres:15 ports: diff --git a/src/documentation/document_index.ipynb b/src/documentation/document_index.ipynb index 8c4f1b97e..aae97a93f 100644 --- a/src/documentation/document_index.ipynb +++ b/src/documentation/document_index.ipynb @@ -25,7 +25,7 @@ " LimitedConcurrencyClient,\n", " SemanticEmbed,\n", ")\n", - "from intelligence_layer.core import InMemoryTracer\n", + "from intelligence_layer.core import InMemoryTracer, LuminousControlModel\n", "from intelligence_layer.examples import MultipleChunkRetrieverQa, RetrieverBasedQaInput\n", "\n", "load_dotenv()" @@ -628,7 +628,9 @@ "outputs": [], "source": [ "client = LimitedConcurrencyClient.from_env()\n", - "retriever_qa = MultipleChunkRetrieverQa(document_index_retriever, insert_chunk_number=3)\n", + "retriever_qa = MultipleChunkRetrieverQa(\n", + " document_index_retriever, insert_chunk_number=3, model=LuminousControlModel()\n", + ")\n", "\n", "\n", "input = RetrieverBasedQaInput(\n", @@ -661,7 +663,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "intelligence-layer-LP3DLT23-py3.12", "language": "python", "name": "python3" }, @@ -675,7 +677,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.12.2" } }, "nbformat": 4, diff --git a/src/documentation/elo_qa_eval.ipynb b/src/documentation/elo_qa_eval.ipynb index 920f6e012..ef2b2c55e 100644 --- a/src/documentation/elo_qa_eval.ipynb +++ b/src/documentation/elo_qa_eval.ipynb @@ -27,9 +27,6 @@ "metadata": {}, "outputs": [], "source": [ - "from os import getenv\n", - "\n", - "from aleph_alpha_client import Client\n", "from dotenv import load_dotenv\n", "\n", "from intelligence_layer.connectors import LimitedConcurrencyClient\n", @@ -56,8 +53,7 @@ "\n", "load_dotenv()\n", "\n", - "aa_client = Client(getenv(\"AA_TOKEN\"))\n", - "limited_concurrency_client = LimitedConcurrencyClient(aa_client, max_retry_time=60)" + "aa_client = 
limited_concurrency_client = LimitedConcurrencyClient.from_env()" ] }, { @@ -205,7 +201,7 @@ "source": [ "models = [\n", " LuminousControlModel(name=\"luminous-base-control\", client=aa_client),\n", - " LuminousControlModel(name=\"luminous-supreme-control\", client=aa_client),\n", + " Llama3InstructModel(name=\"llama-3.1-8b-instruct\", client=aa_client),\n", "]\n", "\n", "for model in models:\n", @@ -292,6 +288,8 @@ "metadata": {}, "outputs": [], "source": [ + "# Here we evaluate with the same model as we want to evaluate for the evaluation.\n", + "# This includes a significant bias and is generally less recommended.\n", "elo_qa_evaluation_logic = EloQaEvaluationLogic(\n", " model=Llama3InstructModel(name=\"llama-3.1-8b-instruct\")\n", ")\n", @@ -450,8 +448,7 @@ "outputs": [], "source": [ "newly_added_models = [\n", - " LuminousControlModel(name=\"luminous-base-control-20230501\", client=aa_client),\n", - " LuminousControlModel(name=\"luminous-supreme-control-20230501\", client=aa_client),\n", + " Llama3InstructModel(name=\"llama-3.1-70b-instruct\", client=aa_client),\n", "]\n", "\n", "for model in newly_added_models:\n", diff --git a/src/documentation/evaluate_with_studio.ipynb b/src/documentation/evaluate_with_studio.ipynb index 06261c588..2773e5f82 100644 --- a/src/documentation/evaluate_with_studio.ipynb +++ b/src/documentation/evaluate_with_studio.ipynb @@ -84,13 +84,6 @@ "Therefore, let's check out what it looks like." ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "code", "execution_count": null, @@ -126,14 +119,17 @@ "metadata": {}, "outputs": [], "source": [ - "all_labels = list(set(item[\"label\"] for item in data))\n", + "# we grab only a subset of the data here to speed up the evaluation. 
Remove the index to run on all example datapoints.\n", + "subset_of_data = data[:5]\n", + "\n", + "all_labels = list(set(item[\"label\"] for item in subset_of_data))\n", "dataset = studio_dataset_repository.create_dataset(\n", " examples=[\n", " Example(\n", " input=ClassifyInput(chunk=TextChunk(item[\"message\"]), labels=all_labels),\n", " expected_output=item[\"label\"],\n", " )\n", - " for item in data\n", + " for item in subset_of_data\n", " ],\n", " dataset_name=\"Single Label Classify Dataset\",\n", ")\n", @@ -281,7 +277,7 @@ ], "metadata": { "kernelspec": { - "display_name": "intelligence-layer-ZqHLMTHE-py3.12", + "display_name": "intelligence-layer-LP3DLT23-py3.12", "language": "python", "name": "python3" }, @@ -295,7 +291,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.3" + "version": "3.12.2" } }, "nbformat": 4, diff --git a/src/documentation/how_tos/how_to_implement_a_task.ipynb b/src/documentation/how_tos/how_to_implement_a_task.ipynb index 54c7228cc..e4dee160a 100644 --- a/src/documentation/how_tos/how_to_implement_a_task.ipynb +++ b/src/documentation/how_tos/how_to_implement_a_task.ipynb @@ -11,8 +11,9 @@ "\n", "from intelligence_layer.core import (\n", " CompleteInput,\n", - " LuminousControlModel,\n", + " ControlModel,\n", " NoOpTracer,\n", + " Pharia1ChatModel,\n", " Task,\n", " TaskSpan,\n", ")\n", @@ -62,7 +63,8 @@ " joke: str\n", "\n", "\n", - "# Step 1 - we want a control model but do not care otherwise. Therefore we use the default.\n", + "# Step 1 - we want a control model but do not care otherwise. Therefore we use the default. 
For our case, the Chat models also work.\n", + "model_to_use = Pharia1ChatModel()\n", "\n", "\n", "# Step 2\n", @@ -70,8 +72,8 @@ " PROMPT_TEMPLATE: str = \"\"\"Tell me a joke about the following topic:\"\"\"\n", "\n", " # Step 2.1\n", - " def __init__(self, model: LuminousControlModel | None = None) -> None:\n", - " self._model = model if model else LuminousControlModel()\n", + " def __init__(self, model: ControlModel | None = None) -> None:\n", + " self._model = model if model else Pharia1ChatModel()\n", "\n", " # Step 2.2\n", " def do_run(\n", @@ -85,7 +87,9 @@ " return TellAJokeTaskOutput(joke=completion.completions[0].completion)\n", "\n", "\n", - "TellAJokeTask().run(TellAJokeTaskInput(topic=\"Software Engineers\"), NoOpTracer())" + "TellAJokeTask(model=model_to_use).run(\n", + " TellAJokeTaskInput(topic=\"Software Engineers\"), NoOpTracer()\n", + ")" ] }, { @@ -109,6 +113,9 @@ "metadata": {}, "outputs": [], "source": [ + "from intelligence_layer.core.model import LuminousControlModel\n", + "\n", + "\n", "class PeopleExtractorInput(BaseModel):\n", " text_passage: str\n", "\n", @@ -142,20 +149,15 @@ "task_input = PeopleExtractorInput(\n", " text_passage=\"Peter ate Sarahs Lunch, their teacher Mr. 
Meyers was very angry with him.'\"\n", ")\n", - "PeopleExtractor().run(task_input, NoOpTracer()).answer" + "PeopleExtractor(task=SingleChunkQa(model=LuminousControlModel())).run(\n", + " task_input, NoOpTracer()\n", + ").answer" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "intelligence-layer-LP3DLT23-py3.12", "language": "python", "name": "python3" }, @@ -169,7 +171,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.12.2" } }, "nbformat": 4, diff --git a/src/documentation/how_tos/studio/how_to_execute_a_benchmark.ipynb b/src/documentation/how_tos/studio/how_to_execute_a_benchmark.ipynb index 3813bdd52..5d96397f0 100644 --- a/src/documentation/how_tos/studio/how_to_execute_a_benchmark.ipynb +++ b/src/documentation/how_tos/studio/how_to_execute_a_benchmark.ipynb @@ -91,7 +91,7 @@ ], "metadata": { "kernelspec": { - "display_name": ".venv", + "display_name": "intelligence-layer-LP3DLT23-py3.12", "language": "python", "name": "python3" }, diff --git a/src/documentation/qa.ipynb b/src/documentation/qa.ipynb index a61dcbb30..6e0701be7 100644 --- a/src/documentation/qa.ipynb +++ b/src/documentation/qa.ipynb @@ -97,7 +97,7 @@ "input = SingleChunkQaInput(chunk=text, question=question, generate_highlights=True)\n", "\n", "# Define a LuminousControlModel and instantiate a SingleChunkQa task\n", - "model = LuminousControlModel(name=\"luminous-supreme-control\")\n", + "model = LuminousControlModel(name=\"luminous-base-control\")\n", "single_chunk_qa = SingleChunkQa(model=model)\n", "\n", "output = single_chunk_qa.run(input, NoOpTracer())\n", @@ -369,7 +369,7 @@ "question = \"What is the name of the book about Robert Moses?\"\n", "input = LongContextQaInput(text=long_text, question=question)\n", "\n", - "long_context_qa = LongContextQa()\n", + 
"long_context_qa = LongContextQa(model=model)\n", "tracer = InMemoryTracer()\n", "output = long_context_qa.run(input, tracer=tracer)" ] @@ -406,7 +406,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "intelligence-layer-LP3DLT23-py3.12", "language": "python", "name": "python3" }, @@ -420,7 +420,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.7" + "version": "3.12.2" } }, "nbformat": 4, diff --git a/src/intelligence_layer/connectors/limited_concurrency_client.py b/src/intelligence_layer/connectors/limited_concurrency_client.py index 1653d6d5f..9ffa74b5f 100644 --- a/src/intelligence_layer/connectors/limited_concurrency_client.py +++ b/src/intelligence_layer/connectors/limited_concurrency_client.py @@ -142,9 +142,9 @@ def from_env( assert token, "Define environment variable AA_TOKEN with a valid token for the Aleph Alpha API" if host is None: host = getenv("CLIENT_URL") - if not host: - host = "https://api.aleph-alpha.com" - print(f"No CLIENT_URL specified in environment, using default: {host}.") + assert ( + host + ), "Define CLIENT_URL with a valid url pointing towards your inference API." 
return cls(Client(token, host=host)) From 8a08c2a44b96ce35dd25f0d4ef29374cbffe515f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20K=C3=B6hnecke?= Date: Mon, 16 Dec 2024 08:33:17 +0100 Subject: [PATCH 09/15] fix: doctests by removing examples and changing models --- .../document_index/document_index.py | 30 ------------------- .../examples/qa/long_context_qa.py | 6 ++-- .../examples/qa/multiple_chunk_qa.py | 6 ++-- .../examples/qa/retriever_based_qa.py | 16 ---------- .../examples/qa/single_chunk_qa.py | 7 ++--- .../examples/search/search.py | 19 ------------ 6 files changed, 9 insertions(+), 75 deletions(-) diff --git a/src/intelligence_layer/connectors/document_index/document_index.py b/src/intelligence_layer/connectors/document_index/document_index.py index 5b83a5acc..1ee8539a8 100644 --- a/src/intelligence_layer/connectors/document_index/document_index.py +++ b/src/intelligence_layer/connectors/document_index/document_index.py @@ -438,36 +438,6 @@ class DocumentIndexClient: Args: token: A valid token for the document index API. base_document_index_url: The url of the document index' API. - - Example: - >>> import os - - >>> from intelligence_layer.connectors import ( - ... CollectionPath, - ... DocumentContents, - ... DocumentIndexClient, - ... DocumentPath, - ... SearchQuery, - ... ) - - >>> document_index = DocumentIndexClient(os.getenv("AA_TOKEN")) - >>> collection_path = CollectionPath( - ... namespace="aleph-alpha", collection="wikipedia-de" - ... ) - >>> document_index.create_collection(collection_path) - >>> document_index.add_document( - ... document_path=DocumentPath( - ... collection_path=collection_path, document_name="Fun facts about Germany" - ... ), - ... contents=DocumentContents.from_text("Germany is a country located in ..."), - ... ) - >>> search_result = document_index.search( - ... collection_path=collection_path, - ... index_name="asymmetric", - ... search_query=SearchQuery( - ... 
query="What is the capital of Germany", max_results=4, min_score=0.5 - ... ), - ... ) """ def __init__( diff --git a/src/intelligence_layer/examples/qa/long_context_qa.py b/src/intelligence_layer/examples/qa/long_context_qa.py index b7f401a95..80de61a21 100644 --- a/src/intelligence_layer/examples/qa/long_context_qa.py +++ b/src/intelligence_layer/examples/qa/long_context_qa.py @@ -55,11 +55,11 @@ class LongContextQa(Task[LongContextQaInput, MultipleChunkQaOutput]): model: The model used in the task. Example: - >>> from intelligence_layer.core import InMemoryTracer + >>> from intelligence_layer.core import InMemoryTracer, LuminousControlModel >>> from intelligence_layer.examples import LongContextQa, LongContextQaInput - - >>> task = LongContextQa() + >>> model = LuminousControlModel("luminous-base-control") + >>> task = LongContextQa(model=model) >>> input = LongContextQaInput(text="Lengthy text goes here...", ... question="Where does the text go?") >>> tracer = InMemoryTracer() diff --git a/src/intelligence_layer/examples/qa/multiple_chunk_qa.py b/src/intelligence_layer/examples/qa/multiple_chunk_qa.py index af5124d22..f31eea047 100644 --- a/src/intelligence_layer/examples/qa/multiple_chunk_qa.py +++ b/src/intelligence_layer/examples/qa/multiple_chunk_qa.py @@ -141,15 +141,15 @@ class MultipleChunkQa(Task[MultipleChunkQaInput, MultipleChunkQaOutput]): >>> from intelligence_layer.connectors import ( ... LimitedConcurrencyClient, ... ) - >>> from intelligence_layer.core import Language, InMemoryTracer + >>> from intelligence_layer.core import Language, InMemoryTracer, LuminousControlModel >>> from intelligence_layer.core.chunk import TextChunk >>> from intelligence_layer.examples import ( ... MultipleChunkQa, ... MultipleChunkQaInput, ... ) - - >>> task = MultipleChunkQa() + >>> model = LuminousControlModel("luminous-base-control") + >>> task = MultipleChunkQa(merge_answers_model=model) >>> input = MultipleChunkQaInput( ... 
chunks=[TextChunk("Tina does not like pizza."), TextChunk("Mike is a big fan of pizza.")], ... question="Who likes pizza?", diff --git a/src/intelligence_layer/examples/qa/retriever_based_qa.py b/src/intelligence_layer/examples/qa/retriever_based_qa.py index 55079e929..145249e88 100644 --- a/src/intelligence_layer/examples/qa/retriever_based_qa.py +++ b/src/intelligence_layer/examples/qa/retriever_based_qa.py @@ -71,22 +71,6 @@ class RetrieverBasedQa( retriever: Used to access and return a set of texts. multi_chunk_qa: The task that is used to generate an answer for a single chunk (retrieved through the retriever). Defaults to :class:`MultipleChunkQa` . - - Example: - >>> import os - >>> from intelligence_layer.connectors import DocumentIndexClient - >>> from intelligence_layer.connectors import DocumentIndexRetriever - >>> from intelligence_layer.core import InMemoryTracer - >>> from intelligence_layer.examples import RetrieverBasedQa, RetrieverBasedQaInput - - - >>> token = os.getenv("AA_TOKEN") - >>> document_index = DocumentIndexClient(token) - >>> retriever = DocumentIndexRetriever(document_index, "asymmetric", "aleph-alpha", "wikipedia-de", 3) - >>> task = RetrieverBasedQa(retriever) - >>> input_data = RetrieverBasedQaInput(question="When was Rome founded?") - >>> tracer = InMemoryTracer() - >>> output = task.run(input_data, tracer) """ def __init__( diff --git a/src/intelligence_layer/examples/qa/single_chunk_qa.py b/src/intelligence_layer/examples/qa/single_chunk_qa.py index 26fc205a3..8f18bf4b5 100644 --- a/src/intelligence_layer/examples/qa/single_chunk_qa.py +++ b/src/intelligence_layer/examples/qa/single_chunk_qa.py @@ -104,11 +104,10 @@ class SingleChunkQa(Task[SingleChunkQaInput, SingleChunkQaOutput]): Example: >>> import os - >>> from intelligence_layer.core import Language, InMemoryTracer - >>> from intelligence_layer.core import TextChunk + >>> from intelligence_layer.core import Language, InMemoryTracer, TextChunk, LuminousControlModel >>> from 
intelligence_layer.examples import SingleChunkQa, SingleChunkQaInput - >>> - >>> task = SingleChunkQa() + >>> model = LuminousControlModel("luminous-base-control") + >>> task = SingleChunkQa(model=model) >>> input = SingleChunkQaInput( ... chunk=TextChunk("Tina does not like pizza. However, Mike does."), ... question="Who likes pizza?", diff --git a/src/intelligence_layer/examples/search/search.py b/src/intelligence_layer/examples/search/search.py index babeac927..148a592a6 100644 --- a/src/intelligence_layer/examples/search/search.py +++ b/src/intelligence_layer/examples/search/search.py @@ -46,25 +46,6 @@ class Search(Generic[ID], Task[SearchInput, SearchOutput[ID]]): Args: retriever: Implements logic to retrieve matching texts to the query. - - Example: - >>> from os import getenv - >>> from intelligence_layer.connectors import ( - ... DocumentIndexClient, - ... ) - >>> from intelligence_layer.connectors import ( - ... DocumentIndexRetriever, - ... ) - >>> from intelligence_layer.core import InMemoryTracer - >>> from intelligence_layer.examples import Search, SearchInput - - - >>> document_index = DocumentIndexClient(getenv("AA_TOKEN")) - >>> retriever = DocumentIndexRetriever(document_index, "asymmetric", "aleph-alpha", "wikipedia-de", 3) - >>> task = Search(retriever) - >>> input = SearchInput(query="When did East and West Germany reunite?") - >>> tracer = InMemoryTracer() - >>> output = task.run(input, tracer) """ def __init__(self, retriever: BaseRetriever[ID]): From 707c8bee797a695eca93a725518a52c63b250a88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20K=C3=B6hnecke?= Date: Mon, 16 Dec 2024 08:46:04 +0100 Subject: [PATCH 10/15] fix: expose document index url in CI steps --- .github/workflows/sdk-tests.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/sdk-tests.yml b/.github/workflows/sdk-tests.yml index 7f0a44c04..3e77f8c6b 100644 --- a/.github/workflows/sdk-tests.yml +++ b/.github/workflows/sdk-tests.yml @@ -190,6 +190,7 @@ jobs: 
ARGILLA_API_KEY: "argilla.apikey" CLIENT_URL: ${{ secrets.CLIENT_URL }} STUDIO_URL: "http://localhost:8000/" + DOCUMENT_INDEX_URL: ${{secrets.DOCUMENT_INDEX_URL}} POSTGRES_HOST: "localhost" POSTGRES_PORT: "5433" POSTGRES_DB: "il_sdk" @@ -274,5 +275,6 @@ jobs: ARGILLA_API_KEY: "argilla.apikey" CLIENT_URL: ${{ secrets.CLIENT_URL }} STUDIO_URL: "http://localhost:8001" + DOCUMENT_INDEX_URL: ${{secrets.DOCUMENT_INDEX_URL}} run: | ./scripts/notebook_runner.sh From 31e5afba46d0d1ef15d0ec29cbf037f44f1eea16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20K=C3=B6hnecke?= Date: Mon, 16 Dec 2024 09:20:48 +0100 Subject: [PATCH 11/15] fix: fastapi example having hardcoded url --- src/documentation/fastapi_example.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/documentation/fastapi_example.py b/src/documentation/fastapi_example.py index 8459fed89..cbf001ff5 100644 --- a/src/documentation/fastapi_example.py +++ b/src/documentation/fastapi_example.py @@ -65,7 +65,7 @@ def __call__( def client() -> Client: return Client( token=os.environ["AA_TOKEN"], - host=os.getenv("AA_CLIENT_BASE_URL", "https://api.aleph-alpha.com"), + host=os.environ["CLIENT_URL"], ) @@ -78,7 +78,7 @@ def default_model( def summary_task( model: Annotated[LuminousControlModel, Depends(default_model)], ) -> SteerableSingleChunkSummarize: - return SteerableSingleChunkSummarize(model) + return SteerableSingleChunkSummarize(model=model) @app.post( From a00732de80e3b519a688a737ecb14850531bed58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20K=C3=B6hnecke?= Date: Mon, 16 Dec 2024 09:44:54 +0100 Subject: [PATCH 12/15] fix: clean up studio notebooks to reuse example and make them execution order independent --- src/documentation/how_tos/example_data.py | 2 ++ .../how_to_aggregate_evaluations.ipynb | 4 ++-- .../studio/how_to_execute_a_benchmark.ipynb | 20 +++++++++---------- ...o_upload_existing_datasets_to_studio.ipynb | 8 +++++--- .../how_to_use_studio_with_traces.ipynb | 16 
+++++++-------- 5 files changed, 26 insertions(+), 24 deletions(-) diff --git a/src/documentation/how_tos/example_data.py b/src/documentation/how_tos/example_data.py index 281f3cd6f..9434641c3 100644 --- a/src/documentation/how_tos/example_data.py +++ b/src/documentation/how_tos/example_data.py @@ -112,6 +112,7 @@ class ExampleData: run_overview_2: RunOverview evaluation_overview_1: EvaluationOverview evaluation_overview_2: EvaluationOverview + studio_project_name: str def example_data() -> ExampleData: @@ -159,6 +160,7 @@ def example_data() -> ExampleData: example_data.run_overview_2 = run_overview_2 example_data.evaluation_overview_1 = evaluation_overview_1 example_data.evaluation_overview_2 = evaluation_overview_2 + example_data.studio_project_name = "My Example Project" return example_data diff --git a/src/documentation/how_tos/how_to_aggregate_evaluations.ipynb b/src/documentation/how_tos/how_to_aggregate_evaluations.ipynb index 873861633..b64462376 100644 --- a/src/documentation/how_tos/how_to_aggregate_evaluations.ipynb +++ b/src/documentation/how_tos/how_to_aggregate_evaluations.ipynb @@ -70,7 +70,7 @@ ], "metadata": { "kernelspec": { - "display_name": "intelligence-layer-aL2cXmJM-py3.11", + "display_name": "intelligence-layer-LP3DLT23-py3.12", "language": "python", "name": "python3" }, @@ -84,7 +84,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.8" + "version": "3.12.2" } }, "nbformat": 4, diff --git a/src/documentation/how_tos/studio/how_to_execute_a_benchmark.ipynb b/src/documentation/how_tos/studio/how_to_execute_a_benchmark.ipynb index 5d96397f0..4b1b25e4a 100644 --- a/src/documentation/how_tos/studio/how_to_execute_a_benchmark.ipynb +++ b/src/documentation/how_tos/studio/how_to_execute_a_benchmark.ipynb @@ -9,11 +9,10 @@ "from dotenv import load_dotenv\n", "\n", "from documentation.how_tos.example_data import (\n", - " EXAMPLE_1_INPUT,\n", " DummyAggregationLogic,\n", " DummyEvaluationLogic,\n", 
- " DummyExample,\n", " DummyTask,\n", + " example_data,\n", ")\n", "from intelligence_layer.connectors.studio.studio import StudioClient\n", "from intelligence_layer.evaluation.benchmark.studio_benchmark import (\n", @@ -24,13 +23,8 @@ ")\n", "\n", "load_dotenv()\n", - "\n", - "examples = [\n", - " DummyExample(input=\"input0\", expected_output=\"expected_output0\", data=\"data0\"),\n", - " DummyExample(\n", - " input=EXAMPLE_1_INPUT, expected_output=\"expected_output1\", data=\"data1\"\n", - " ),\n", - "]" + "my_example_data = example_data()\n", + "examples = my_example_data.examples" ] }, { @@ -69,7 +63,11 @@ "outputs": [], "source": [ "# Step 0\n", - "studio_client = StudioClient(project=\"my project_name\", create_project=True)\n", + "from uuid import uuid4\n", + "\n", + "studio_client = StudioClient(\n", + " project=my_example_data.studio_project_name, create_project=True\n", + ")\n", "\n", "# Step 1\n", "studio_dataset_repository = StudioDatasetRepository(studio_client)\n", @@ -80,7 +78,7 @@ "evaluation_logic = DummyEvaluationLogic()\n", "aggregation_logic = DummyAggregationLogic()\n", "benchmark = studio_benchmark_repository.create_benchmark(\n", - " dataset.id, evaluation_logic, aggregation_logic, \"my_benchmark\"\n", + " dataset.id, evaluation_logic, aggregation_logic, f\"my_benchmark-{uuid4()}\"\n", ")\n", "\n", "# Step 3\n", diff --git a/src/documentation/how_tos/studio/how_to_upload_existing_datasets_to_studio.ipynb b/src/documentation/how_tos/studio/how_to_upload_existing_datasets_to_studio.ipynb index d3bebc674..f6971f785 100644 --- a/src/documentation/how_tos/studio/how_to_upload_existing_datasets_to_studio.ipynb +++ b/src/documentation/how_tos/studio/how_to_upload_existing_datasets_to_studio.ipynb @@ -60,7 +60,9 @@ ")\n", "\n", "# Step 1\n", - "studio_client = StudioClient(project=\"my project_name\")\n", + "studio_client = StudioClient(\n", + " project=my_example_data.studio_project_name, create_project=True\n", + ")\n", "\n", "# Step 2\n", 
"studio_dataset_repo = StudioDatasetRepository(studio_client=studio_client)\n", @@ -76,7 +78,7 @@ ], "metadata": { "kernelspec": { - "display_name": "intelligence-layer-aL2cXmJM-py3.11", + "display_name": "intelligence-layer-LP3DLT23-py3.12", "language": "python", "name": "python3" }, @@ -90,7 +92,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.8" + "version": "3.12.2" } }, "nbformat": 4, diff --git a/src/documentation/how_tos/studio/how_to_use_studio_with_traces.ipynb b/src/documentation/how_tos/studio/how_to_use_studio_with_traces.ipynb index 5f7ed9450..405bbe14c 100644 --- a/src/documentation/how_tos/studio/how_to_use_studio_with_traces.ipynb +++ b/src/documentation/how_tos/studio/how_to_use_studio_with_traces.ipynb @@ -6,8 +6,11 @@ "metadata": {}, "outputs": [], "source": [ + "from documentation.how_tos.example_data import DummyTask, example_data\n", "from intelligence_layer.connectors import StudioClient\n", - "from intelligence_layer.core import InMemoryTracer, Task, TaskSpan" + "from intelligence_layer.core import InMemoryTracer\n", + "\n", + "my_example_data = example_data()" ] }, { @@ -45,16 +48,13 @@ "outputs": [], "source": [ "# Step 0\n", - "class DummyTask(Task[str, str]):\n", - " def do_run(self, input: str, task_span: TaskSpan) -> str:\n", - " return f\"{input} -> output\"\n", - "\n", - "\n", "tracer = InMemoryTracer()\n", "DummyTask().run(\"My Dummy Run\", tracer=tracer)\n", "\n", "# Step 1\n", - "studio_client = StudioClient(project=\"my project_name\", create_project=True)\n", + "studio_client = StudioClient(\n", + " project=my_example_data.studio_project_name, create_project=True\n", + ")\n", "\n", "# Step 2.1\n", "trace_to_submit = tracer.export_for_viewing()\n", @@ -70,7 +70,7 @@ ], "metadata": { "kernelspec": { - "display_name": ".venv", + "display_name": "intelligence-layer-LP3DLT23-py3.12", "language": "python", "name": "python3" }, From 97b8a5db32a3ab754b2af2f10bc9376b8142ab66 Mon Sep 
17 00:00:00 2001 From: =?UTF-8?q?Niklas=20K=C3=B6hnecke?= Date: Mon, 16 Dec 2024 11:43:43 +0100 Subject: [PATCH 13/15] fix: document index tests depending on order --- .../studio/how_to_execute_a_benchmark.ipynb | 4 +- tests/conftest.py | 458 +---------------- tests/conftest_document_index.py | 473 ++++++++++++++++++ .../document_index/test_document_index.py | 3 +- .../test_document_index_retriever.py | 6 +- .../test_qdrant_in_memory_retriever.py | 2 +- .../classify/test_embedding_based_classify.py | 2 +- tests/examples/search/test_search.py | 2 +- 8 files changed, 484 insertions(+), 466 deletions(-) create mode 100644 tests/conftest_document_index.py diff --git a/src/documentation/how_tos/studio/how_to_execute_a_benchmark.ipynb b/src/documentation/how_tos/studio/how_to_execute_a_benchmark.ipynb index 4b1b25e4a..9b23112c9 100644 --- a/src/documentation/how_tos/studio/how_to_execute_a_benchmark.ipynb +++ b/src/documentation/how_tos/studio/how_to_execute_a_benchmark.ipynb @@ -6,6 +6,8 @@ "metadata": {}, "outputs": [], "source": [ + "from uuid import uuid4\n", + "\n", "from dotenv import load_dotenv\n", "\n", "from documentation.how_tos.example_data import (\n", @@ -63,8 +65,6 @@ "outputs": [], "source": [ "# Step 0\n", - "from uuid import uuid4\n", - "\n", "studio_client = StudioClient(\n", " project=my_example_data.studio_project_name, create_project=True\n", ")\n", diff --git a/tests/conftest.py b/tests/conftest.py index 4ace913a9..5fedf40a1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,15 +1,8 @@ import os -import random -import re -import string -from collections.abc import Callable, Iterable, Iterator, Sequence -from contextlib import contextmanager -from datetime import datetime, timedelta, timezone -from functools import wraps +from collections.abc import Sequence from os import getenv from pathlib import Path -from time import sleep -from typing import ParamSpec, TypeVar, cast, get_args, overload +from typing import cast from aleph_alpha_client 
import Client, Image from dotenv import load_dotenv @@ -18,27 +11,10 @@ from intelligence_layer.connectors import ( AlephAlphaClientProtocol, Document, - DocumentChunk, - DocumentIndexClient, - DocumentIndexRetriever, LimitedConcurrencyClient, QdrantInMemoryRetriever, RetrieverType, ) -from intelligence_layer.connectors.base.json_serializable import JsonSerializable -from intelligence_layer.connectors.document_index.document_index import ( - CollectionPath, - DocumentContents, - DocumentPath, - EmbeddingConfig, - HybridIndex, - IndexConfiguration, - IndexPath, - InstructableEmbed, - Representation, - SearchQuery, - SemanticEmbed, -) from intelligence_layer.core import ( LuminousControlModel, NoOpTracer, @@ -54,6 +30,7 @@ InMemoryRunRepository, RunOverview, ) +from tests.conftest_document_index import * # noqa: F403 - we import everything here to get the file to be "appended" to this file and thus making all fixtures available @fixture(scope="session") @@ -88,13 +65,6 @@ def pharia_1_chat_model(client: AlephAlphaClientProtocol) -> Pharia1ChatModel: return Pharia1ChatModel("pharia-1-llm-7b-control", client) -@fixture(scope="session") -def document_index(token: str) -> DocumentIndexClient: - return DocumentIndexClient( - token, base_document_index_url=os.environ["DOCUMENT_INDEX_URL"] - ) - - @fixture def no_op_tracer() -> NoOpTracer: return NoOpTracer() @@ -132,428 +102,6 @@ def symmetric_in_memory_retriever( ) -# document index setup -P = ParamSpec("P") -R = TypeVar("R") - - -@overload -def retry( - func: None = None, max_retries: int = 3, seconds_delay: float = 0.0 -) -> Callable[[Callable[P, R]], Callable[P, R]]: ... - - -@overload -def retry( - func: Callable[P, R], max_retries: int = 3, seconds_delay: float = 0.0 -) -> Callable[P, R]: ... 
- - -def retry( - func: Callable[P, R] | None = None, - max_retries: int = 60, - seconds_delay: float = 0.5, -) -> Callable[[Callable[P, R]], Callable[P, R]] | Callable[P, R]: - def decorator(func: Callable[P, R]) -> Callable[P, R]: - @wraps(func) - def wrapper(*args: P.args, **kwargs: P.kwargs) -> R: - for _ in range(1 + max_retries): - try: - return func(*args, **kwargs) - except Exception as e: - last_exception = e - sleep(seconds_delay) - - raise last_exception - - return wrapper - - if func is None: - return decorator - else: - return decorator(func) - - -def random_alphanumeric_string(length: int = 20) -> str: - return "".join(random.choices(string.ascii_letters + string.digits, k=length)) - - -def random_identifier() -> str: - name = random_alphanumeric_string(10) - timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S") - return f"intelligence-layer-ci-{name}-{timestamp}" - - -def is_outdated_identifier(identifier: str, timestamp_threshold: datetime) -> bool: - # match the format that is defined in random_identifier() - matched = re.match( - r"^intelligence-layer-ci-[a-zA-Z0-9]{10}-(?P<timestamp>\d{8}T\d{6})$", - identifier, - ) - if matched is None: - return False - - timestamp = datetime.strptime(matched["timestamp"], "%Y%m%dT%H%M%S").replace( - tzinfo=timezone.utc - ) - return not timestamp > timestamp_threshold - - -def random_semantic_embed() -> EmbeddingConfig: - return SemanticEmbed( - representation=random.choice(get_args(Representation)), - model_name="luminous-base", - ) - - -def random_instructable_embed() -> EmbeddingConfig: - return InstructableEmbed( - model_name="pharia-1-embedding-4608-control", - query_instruction=random_alphanumeric_string(), - document_instruction=random_alphanumeric_string(), - ) - - -def random_embedding_config() -> EmbeddingConfig: - return random.choice([random_semantic_embed(), random_instructable_embed()]) - - -@fixture -def document_contents() -> DocumentContents: - text = """John Stith Pemberton, the inventor of the 
world-renowned beverage Coca-Cola, was a figure whose life was marked by creativity, entrepreneurial spirit, and the turbulent backdrop of 19th-century America. Born on January 8, 1831, in Knoxville, Georgia, Pemberton grew up in an era of profound transformation and change. - -Pemberton began his professional journey by studying medicine and pharmacy. After earning a degree in pharmacy, he started his career as a druggist in Columbus, Georgia. He was known for his keen interest in creating medicinal concoctions and was well-respected in his community. His early creations included various medicines and tonics, which were typical of the times when pharmacists often concocted their own remedies. - -Pemberton's life took a significant turn during the American Civil War. He served as a lieutenant colonel in the Confederate Army, and it was during this period that he sustained a wound that led him to become dependent on morphine. This personal struggle with addiction likely influenced his later work in seeking out alternatives and remedies for pain relief. - -In the post-war years, Pemberton relocated to Atlanta, Georgia, where he continued to experiment with various medicinal syrups and tonics. It was during this time, in the late 19th century, that he developed a beverage he initially called "Pemberton's French Wine Coca." This concoction was inspired by Vin Mariani, a popular French tonic wine that contained coca leaves. Pemberton's beverage was intended to serve not just as a refreshing drink but also as a remedy for various ailments, including morphine addiction, indigestion, and headaches. - -However, in 1886, when Atlanta introduced prohibition legislation, Pemberton was compelled to create a non-alcoholic version of his beverage. He experimented with a combination of carbonated water, coca leaf extract, kola nut, and other ingredients, eventually perfecting the formula for what would soon become Coca-Cola. 
The name was suggested by his bookkeeper, Frank Robinson, who also created the distinctive cursive logo that is still in use today. - -Pemberton advertised his new creation as a "brain tonic" and "temperance drink," asserting that it could alleviate headaches and fatigue. However, due to his declining health and financial difficulties, Pemberton was eventually compelled to sell portions of his business to various partners. Shortly before his death in 1888, he sold his remaining stake in Coca-Cola to Asa G. Candler, a fellow pharmacist and businessman. - -Under Candler's leadership, Coca-Cola transformed from a pharmacist's concoction into a mass-produced and marketed beverage that became a staple of American culture and a global icon. Despite the changes and the immense growth of the brand, the legacy of John Stith Pemberton as the inventor of Coca-Cola remains an integral part of the beverage's history. - -Pemberton's life story is a testament to the spirit of innovation and resilience. His creation, borne out of personal struggles and the context of his times, went on to transcend its origins and become a symbol recognized across the globe. Today, when we think of Coca-Cola, we are reminded of Pemberton's journey from a small-town pharmacist to the creator of one of the world's most enduring and beloved brands.""" - return DocumentContents(contents=[text], metadata={"Some": "Metadata"}) - - -@fixture(scope="session") -def document_contents_with_metadata() -> list[DocumentContents]: - text_1 = """John Stith Pemberton, the inventor of the world-renowned beverage Coca-Cola, was a figure whose life was marked by creativity, entrepreneurial spirit, and the turbulent backdrop of 19th-century America. Born on January 8, 1831, in Knoxville, Georgia, Pemberton grew up in an era of profound transformation and change.""" - text_2 = """Pemberton began his professional journey by studying medicine and pharmacy. 
After earning a degree in pharmacy, he started his career as a druggist in Columbus, Georgia. He was known for his keen interest in creating medicinal concoctions and was well-respected in his community. His early creations included various medicines and tonics, which were typical of the times when pharmacists often concocted their own remedies.""" - text_3 = """Pemberton's life took a significant turn during the American Civil War. He served as a lieutenant colonel in the Confederate Army, and it was during this period that he sustained a wound that led him to become dependent on morphine. This personal struggle with addiction likely influenced his later work in seeking out alternatives and remedies for pain relief.""" - - metadata_1: JsonSerializable = { - "string-field": "example_string_1", - "integer-field": 123, - "float-field": 123.45, - "boolean-field": True, - "date-field": datetime(2022, 1, 1, tzinfo=timezone.utc) - .isoformat(timespec="seconds") - .replace("+00:00", "Z"), - } - - metadata_2: JsonSerializable = { - "string-field": "example_string_2", - "integer-field": 456, - "float-field": 678.90, - "boolean-field": False, - "date-field": datetime(2023, 1, 1, tzinfo=timezone.utc) - .isoformat(timespec="seconds") - .replace("+00:00", "Z"), - } - - metadata_3: JsonSerializable = { - "string-field": "example_string_3", - "integer-field": 789, - "float-field": 101112.13, - "boolean-field": True, - "date-field": datetime(2024, 1, 1, tzinfo=timezone.utc) - .isoformat(timespec="seconds") - .replace("+00:00", "Z"), - } - - return [ - DocumentContents(contents=[text_1], metadata=metadata_1), - DocumentContents(contents=[text_2], metadata=metadata_2), - DocumentContents(contents=[text_3], metadata=metadata_3), - ] - - -@fixture(scope="session") -def document_index_namespace(document_index: DocumentIndexClient) -> Iterable[str]: - yield "Search" - _teardown(document_index, "Search") - - -def _teardown( - document_index: DocumentIndexClient, document_index_namespace: 
str -) -> Iterator[None]: - yield - - # Cleanup leftover resources from previous runs. - timestamp_threshold = datetime.now(timezone.utc) - timedelta(hours=1) - - collections = document_index.list_collections(document_index_namespace) - for collection_path in collections: - if is_outdated_identifier(collection_path.collection, timestamp_threshold): - document_index.delete_collection(collection_path) - - indexes = document_index.list_indexes(document_index_namespace) - for index_path in indexes: - if is_outdated_identifier(index_path.index, timestamp_threshold): - document_index.delete_index(index_path) - - filter_indexes = document_index.list_filter_indexes_in_namespace( - document_index_namespace - ) - for filter_index in filter_indexes: - if is_outdated_identifier(filter_index, timestamp_threshold): - document_index.delete_filter_index_from_namespace( - document_index_namespace, filter_index - ) - - -@fixture(scope="session") -def filter_index_configs() -> dict[str, dict[str, str]]: - return { - random_identifier(): { - "field-name": "string-field", - "field-type": "string", - }, - random_identifier(): { - "field-name": "integer-field", - "field-type": "integer", - }, - random_identifier(): { - "field-name": "float-field", - "field-type": "float", - }, - random_identifier(): { - "field-name": "boolean-field", - "field-type": "boolean", - }, - random_identifier(): { - "field-name": "date-field", - "field-type": "date_time", - }, - } - - -@contextmanager -def random_index_with_embedding_config( - document_index: DocumentIndexClient, - document_index_namespace: str, - embedding_config: EmbeddingConfig, -) -> Iterator[tuple[IndexPath, IndexConfiguration]]: - name = random_identifier() - - chunk_size, chunk_overlap = sorted( - random.sample([0, 32, 64, 128, 256, 512, 1024], 2), reverse=True - ) - - hybrid_index_choices: list[HybridIndex] = ["bm25", None] - hybrid_index = random.choice(hybrid_index_choices) - - index = IndexPath(namespace=document_index_namespace, 
index=name) - index_configuration = IndexConfiguration( - chunk_size=chunk_size, - chunk_overlap=chunk_overlap, - hybrid_index=hybrid_index, - embedding=embedding_config, - ) - try: - document_index.create_index(index, index_configuration) - yield index, index_configuration - finally: - document_index.delete_index(index) - - -@fixture -def random_instructable_index( - document_index: DocumentIndexClient, document_index_namespace: str -) -> Iterator[tuple[IndexPath, IndexConfiguration]]: - with random_index_with_embedding_config( - document_index, document_index_namespace, random_instructable_embed() - ) as index: - yield index - - -@fixture -def random_semantic_index( - document_index: DocumentIndexClient, document_index_namespace: str -) -> Iterator[tuple[IndexPath, IndexConfiguration]]: - with random_index_with_embedding_config( - document_index, document_index_namespace, random_semantic_embed() - ) as index: - yield index - - -@fixture -def random_index( - document_index: DocumentIndexClient, document_index_namespace: str -) -> Iterator[tuple[IndexPath, IndexConfiguration]]: - with random_index_with_embedding_config( - document_index, - document_index_namespace, - random.choice([random_semantic_embed(), random_instructable_embed()]), - ) as index: - yield index - - -@fixture -def random_collection( - document_index: DocumentIndexClient, - document_index_namespace: str, -) -> Iterator[CollectionPath]: - collection_name = random_identifier() - collection_path = CollectionPath( - namespace=document_index_namespace, collection=collection_name - ) - try: - document_index.create_collection(collection_path) - - yield collection_path - finally: - document_index.delete_collection(collection_path) - - -@fixture(scope="session") -def read_only_populated_collection( - document_index: DocumentIndexClient, - document_index_namespace: str, - document_contents_with_metadata: list[DocumentContents], - filter_index_configs: dict[str, dict[str, str]], -) -> 
Iterator[tuple[CollectionPath, IndexPath]]: - index_name = random_identifier() - index_path = IndexPath(namespace=document_index_namespace, index=index_name) - index_configuration = IndexConfiguration( - chunk_size=512, - chunk_overlap=0, - hybrid_index="bm25", - embedding=SemanticEmbed( - representation="asymmetric", - model_name="luminous-base", - ), - ) - - collection_name = random_identifier() - collection_path = CollectionPath( - namespace=document_index_namespace, collection=collection_name - ) - - try: - document_index.create_collection(collection_path) - document_index.create_index(index_path, index_configuration) - document_index.assign_index_to_collection(collection_path, index_name) - - for name, config in filter_index_configs.items(): - document_index.create_filter_index_in_namespace( - namespace=document_index_namespace, - filter_index_name=name, - field_name=config["field-name"], - field_type=config["field-type"], # type:ignore[arg-type] - ) - document_index.assign_filter_index_to_search_index( - collection_path=collection_path, - index_name=index_name, - filter_index_name=name, - ) - - for i, content in enumerate(document_contents_with_metadata): - document_index.add_document( - DocumentPath( - collection_path=collection_path, - document_name=f"document-{i}", - ), - content, - ) - - yield collection_path, index_path - finally: - document_index.delete_collection(collection_path) - - @retry - def clean_up_indexes() -> None: - document_index.delete_index(index_path) - for filter_index_name in filter_index_configs: - document_index.delete_filter_index_from_namespace( - document_index_namespace, filter_index_name - ) - - clean_up_indexes() - - -@fixture -def random_searchable_collection( - document_index: DocumentIndexClient, - document_contents_with_metadata: list[DocumentContents], - random_index: tuple[IndexPath, IndexConfiguration], - random_collection: CollectionPath, -) -> Iterator[tuple[CollectionPath, IndexPath]]: - index_path, _ = random_index - 
index_name = index_path.index - collection_path = random_collection - - try: - # Assign index - document_index.assign_index_to_collection(collection_path, index_name) - - # Add 3 documents - for i, content in enumerate(document_contents_with_metadata): - document_index.add_document( - DocumentPath( - collection_path=collection_path, - document_name=f"document-{i}", - ), - content, - ) - - # Ensure documents are searchable; this allows time for indexing - @retry - def search() -> None: - search_result = document_index.search( - collection_path, - index_name, - SearchQuery( - query="Coca-Cola", - ), - ) - assert len(search_result) > 0 - - search() - - yield collection_path, index_path - finally: - document_index.delete_collection(collection_path) - - @retry - def clean_up_index() -> None: - document_index.delete_index(index_path) - - clean_up_index() - - -# end document index setup -@fixture -def document_index_retriever( - random_searchable_collection: tuple[CollectionPath, IndexPath], - document_index: DocumentIndexClient, -) -> DocumentIndexRetriever: - return DocumentIndexRetriever( - document_index, - index_name=random_searchable_collection[1].index, - namespace=random_searchable_collection[0].namespace, - collection=random_searchable_collection[0].collection, - k=2, - ) - - -def to_document(document_chunk: DocumentChunk) -> Document: - return Document(text=document_chunk.text, metadata=document_chunk.metadata) - - @fixture def in_memory_dataset_repository() -> InMemoryDatasetRepository: return InMemoryDatasetRepository() diff --git a/tests/conftest_document_index.py b/tests/conftest_document_index.py new file mode 100644 index 000000000..9950fe7e7 --- /dev/null +++ b/tests/conftest_document_index.py @@ -0,0 +1,473 @@ +import os +import random +import re +import string +from collections.abc import Callable, Iterable, Iterator +from contextlib import contextmanager +from datetime import datetime, timedelta, timezone +from functools import wraps +from time import 
sleep +from typing import ParamSpec, TypeVar, get_args, overload + +from pytest import fixture + +from intelligence_layer.connectors import ( + DocumentIndexClient, + DocumentIndexRetriever, +) +from intelligence_layer.connectors.base.json_serializable import JsonSerializable +from intelligence_layer.connectors.document_index.document_index import ( + CollectionPath, + DocumentContents, + DocumentPath, + EmbeddingConfig, + HybridIndex, + IndexConfiguration, + IndexPath, + InstructableEmbed, + Representation, + SearchQuery, + SemanticEmbed, +) +from intelligence_layer.connectors.retrievers.base_retriever import ( + Document, + DocumentChunk, +) + +P = ParamSpec("P") +R = TypeVar("R") + + +@fixture(scope="session") +def document_index(token: str) -> DocumentIndexClient: + return DocumentIndexClient( + token, base_document_index_url=os.environ["DOCUMENT_INDEX_URL"] + ) + + +def to_document(document_chunk: DocumentChunk) -> Document: + return Document(text=document_chunk.text, metadata=document_chunk.metadata) + + +@overload +def retry( + func: None = None, max_retries: int = 3, seconds_delay: float = 0.0 +) -> Callable[[Callable[P, R]], Callable[P, R]]: ... + + +@overload +def retry( + func: Callable[P, R], max_retries: int = 3, seconds_delay: float = 0.0 +) -> Callable[P, R]: ... 
+ + +def retry( + func: Callable[P, R] | None = None, + max_retries: int = 60, + seconds_delay: float = 0.5, +) -> Callable[[Callable[P, R]], Callable[P, R]] | Callable[P, R]: + def decorator(func: Callable[P, R]) -> Callable[P, R]: + @wraps(func) + def wrapper(*args: P.args, **kwargs: P.kwargs) -> R: + for _ in range(1 + max_retries): + try: + return func(*args, **kwargs) + except Exception as e: + last_exception = e + sleep(seconds_delay) + + raise last_exception + + return wrapper + + if func is None: + return decorator + else: + return decorator(func) + + +def random_alphanumeric_string(length: int = 20) -> str: + return "".join(random.choices(string.ascii_letters + string.digits, k=length)) + + +def random_identifier() -> str: + name = random_alphanumeric_string(10) + timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S") + return f"intelligence-layer-ci-{name}-{timestamp}" + + +def is_outdated_identifier(identifier: str, timestamp_threshold: datetime) -> bool: + # match the format that is defined in random_identifier() + matched = re.match( + r"^intelligence-layer-ci-[a-zA-Z0-9]{10}-(?P\d{8}T\d{6})$", + identifier, + ) + if matched is None: + return False + + timestamp = datetime.strptime(matched["timestamp"], "%Y%m%dT%H%M%S").replace( + tzinfo=timezone.utc + ) + return not timestamp > timestamp_threshold + + +def random_semantic_embed() -> EmbeddingConfig: + return SemanticEmbed( + representation=random.choice(get_args(Representation)), + model_name="luminous-base", + ) + + +def random_instructable_embed() -> EmbeddingConfig: + return InstructableEmbed( + model_name="pharia-1-embedding-4608-control", + query_instruction=random_alphanumeric_string(), + document_instruction=random_alphanumeric_string(), + ) + + +def random_embedding_config() -> EmbeddingConfig: + return random.choice([random_semantic_embed(), random_instructable_embed()]) + + +@fixture +def document_contents() -> DocumentContents: + text = """John Stith Pemberton, the inventor of the 
world-renowned beverage Coca-Cola, was a figure whose life was marked by creativity, entrepreneurial spirit, and the turbulent backdrop of 19th-century America. Born on January 8, 1831, in Knoxville, Georgia, Pemberton grew up in an era of profound transformation and change. + +Pemberton began his professional journey by studying medicine and pharmacy. After earning a degree in pharmacy, he started his career as a druggist in Columbus, Georgia. He was known for his keen interest in creating medicinal concoctions and was well-respected in his community. His early creations included various medicines and tonics, which were typical of the times when pharmacists often concocted their own remedies. + +Pemberton's life took a significant turn during the American Civil War. He served as a lieutenant colonel in the Confederate Army, and it was during this period that he sustained a wound that led him to become dependent on morphine. This personal struggle with addiction likely influenced his later work in seeking out alternatives and remedies for pain relief. + +In the post-war years, Pemberton relocated to Atlanta, Georgia, where he continued to experiment with various medicinal syrups and tonics. It was during this time, in the late 19th century, that he developed a beverage he initially called "Pemberton's French Wine Coca." This concoction was inspired by Vin Mariani, a popular French tonic wine that contained coca leaves. Pemberton's beverage was intended to serve not just as a refreshing drink but also as a remedy for various ailments, including morphine addiction, indigestion, and headaches. + +However, in 1886, when Atlanta introduced prohibition legislation, Pemberton was compelled to create a non-alcoholic version of his beverage. He experimented with a combination of carbonated water, coca leaf extract, kola nut, and other ingredients, eventually perfecting the formula for what would soon become Coca-Cola. 
The name was suggested by his bookkeeper, Frank Robinson, who also created the distinctive cursive logo that is still in use today. + +Pemberton advertised his new creation as a "brain tonic" and "temperance drink," asserting that it could alleviate headaches and fatigue. However, due to his declining health and financial difficulties, Pemberton was eventually compelled to sell portions of his business to various partners. Shortly before his death in 1888, he sold his remaining stake in Coca-Cola to Asa G. Candler, a fellow pharmacist and businessman. + +Under Candler's leadership, Coca-Cola transformed from a pharmacist's concoction into a mass-produced and marketed beverage that became a staple of American culture and a global icon. Despite the changes and the immense growth of the brand, the legacy of John Stith Pemberton as the inventor of Coca-Cola remains an integral part of the beverage's history. + +Pemberton's life story is a testament to the spirit of innovation and resilience. His creation, borne out of personal struggles and the context of his times, went on to transcend its origins and become a symbol recognized across the globe. Today, when we think of Coca-Cola, we are reminded of Pemberton's journey from a small-town pharmacist to the creator of one of the world's most enduring and beloved brands.""" + return DocumentContents(contents=[text], metadata={"Some": "Metadata"}) + + +@fixture(scope="session") +def document_contents_with_metadata() -> list[DocumentContents]: + text_1 = """John Stith Pemberton, the inventor of the world-renowned beverage Coca-Cola, was a figure whose life was marked by creativity, entrepreneurial spirit, and the turbulent backdrop of 19th-century America. Born on January 8, 1831, in Knoxville, Georgia, Pemberton grew up in an era of profound transformation and change.""" + text_2 = """Pemberton began his professional journey by studying medicine and pharmacy. 
After earning a degree in pharmacy, he started his career as a druggist in Columbus, Georgia. He was known for his keen interest in creating medicinal concoctions and was well-respected in his community. His early creations included various medicines and tonics, which were typical of the times when pharmacists often concocted their own remedies.""" + text_3 = """Pemberton's life took a significant turn during the American Civil War. He served as a lieutenant colonel in the Confederate Army, and it was during this period that he sustained a wound that led him to become dependent on morphine. This personal struggle with addiction likely influenced his later work in seeking out alternatives and remedies for pain relief.""" + + metadata_1: JsonSerializable = { + "string-field": "example_string_1", + "integer-field": 123, + "float-field": 123.45, + "boolean-field": True, + "date-field": datetime(2022, 1, 1, tzinfo=timezone.utc) + .isoformat(timespec="seconds") + .replace("+00:00", "Z"), + } + + metadata_2: JsonSerializable = { + "string-field": "example_string_2", + "integer-field": 456, + "float-field": 678.90, + "boolean-field": False, + "date-field": datetime(2023, 1, 1, tzinfo=timezone.utc) + .isoformat(timespec="seconds") + .replace("+00:00", "Z"), + } + + metadata_3: JsonSerializable = { + "string-field": "example_string_3", + "integer-field": 789, + "float-field": 101112.13, + "boolean-field": True, + "date-field": datetime(2024, 1, 1, tzinfo=timezone.utc) + .isoformat(timespec="seconds") + .replace("+00:00", "Z"), + } + + return [ + DocumentContents(contents=[text_1], metadata=metadata_1), + DocumentContents(contents=[text_2], metadata=metadata_2), + DocumentContents(contents=[text_3], metadata=metadata_3), + ] + + +@fixture(scope="session") +def document_index_namespace(document_index: DocumentIndexClient) -> Iterable[str]: + yield "Search" + _teardown(document_index, "Search") + + +def _teardown( + document_index: DocumentIndexClient, document_index_namespace: 
str +) -> Iterator[None]: + yield + + # Cleanup leftover resources from previous runs. + timestamp_threshold = datetime.now(timezone.utc) - timedelta(hours=1) + + collections = document_index.list_collections(document_index_namespace) + for collection_path in collections: + if is_outdated_identifier(collection_path.collection, timestamp_threshold): + document_index.delete_collection(collection_path) + + indexes = document_index.list_indexes(document_index_namespace) + for index_path in indexes: + if is_outdated_identifier(index_path.index, timestamp_threshold): + document_index.delete_index(index_path) + + filter_indexes = document_index.list_filter_indexes_in_namespace( + document_index_namespace + ) + for filter_index in filter_indexes: + if is_outdated_identifier(filter_index, timestamp_threshold): + document_index.delete_filter_index_from_namespace( + document_index_namespace, filter_index + ) + + +@fixture(scope="session") +def filter_index_configs( + document_index: DocumentIndexClient, + document_index_namespace: str, +) -> dict[str, dict[str, str]]: + configs = { + random_identifier(): { + "field-name": "string-field", + "field-type": "string", + }, + random_identifier(): { + "field-name": "integer-field", + "field-type": "integer", + }, + random_identifier(): { + "field-name": "float-field", + "field-type": "float", + }, + random_identifier(): { + "field-name": "boolean-field", + "field-type": "boolean", + }, + random_identifier(): { + "field-name": "date-field", + "field-type": "date_time", + }, + } + + for name, config in configs.items(): + document_index.create_filter_index_in_namespace( + namespace=document_index_namespace, + filter_index_name=name, + field_name=config["field-name"], + field_type=config["field-type"], # type:ignore[arg-type] + ) + + return configs + + +@contextmanager +def random_index_with_embedding_config( + document_index: DocumentIndexClient, + document_index_namespace: str, + embedding_config: EmbeddingConfig, +) -> 
Iterator[tuple[IndexPath, IndexConfiguration]]: + name = random_identifier() + + chunk_size, chunk_overlap = sorted( + random.sample([0, 32, 64, 128, 256, 512, 1024], 2), reverse=True + ) + + hybrid_index_choices: list[HybridIndex] = ["bm25", None] + hybrid_index = random.choice(hybrid_index_choices) + + index = IndexPath(namespace=document_index_namespace, index=name) + index_configuration = IndexConfiguration( + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + hybrid_index=hybrid_index, + embedding=embedding_config, + ) + try: + document_index.create_index(index, index_configuration) + yield index, index_configuration + finally: + document_index.delete_index(index) + + +@fixture +def random_instructable_index( + document_index: DocumentIndexClient, document_index_namespace: str +) -> Iterator[tuple[IndexPath, IndexConfiguration]]: + with random_index_with_embedding_config( + document_index, document_index_namespace, random_instructable_embed() + ) as index: + yield index + + +@fixture +def random_semantic_index( + document_index: DocumentIndexClient, document_index_namespace: str +) -> Iterator[tuple[IndexPath, IndexConfiguration]]: + with random_index_with_embedding_config( + document_index, document_index_namespace, random_semantic_embed() + ) as index: + yield index + + +@fixture +def random_index( + document_index: DocumentIndexClient, document_index_namespace: str +) -> Iterator[tuple[IndexPath, IndexConfiguration]]: + with random_index_with_embedding_config( + document_index, + document_index_namespace, + random.choice([random_semantic_embed(), random_instructable_embed()]), + ) as index: + yield index + + +@fixture +def random_collection( + document_index: DocumentIndexClient, + document_index_namespace: str, +) -> Iterator[CollectionPath]: + collection_name = random_identifier() + collection_path = CollectionPath( + namespace=document_index_namespace, collection=collection_name + ) + try: + document_index.create_collection(collection_path) + + 
yield collection_path + finally: + document_index.delete_collection(collection_path) + + +def _add_documents_to_document_index( + document_index: DocumentIndexClient, + documents: list[DocumentContents], + index_name: str, + collection_path: CollectionPath, +): + # Add all documents + for i, content in enumerate(documents): + document_index.add_document( + DocumentPath( + collection_path=collection_path, + document_name=f"document-{i}", + ), + content, + ) + + # Ensure documents are searchable; this allows time for indexing + @retry + def search() -> None: + search_result = document_index.search( + collection_path, + index_name, + SearchQuery( + query="Coca-Cola", + ), + ) + assert len(search_result) > 0 + + search() + + +@fixture(scope="session") +def read_only_populated_collection( + document_index: DocumentIndexClient, + document_index_namespace: str, + document_contents_with_metadata: list[DocumentContents], + filter_index_configs: dict[str, dict[str, str]], +) -> Iterator[tuple[CollectionPath, IndexPath]]: + index_name = random_identifier() + index_path = IndexPath(namespace=document_index_namespace, index=index_name) + index_configuration = IndexConfiguration( + chunk_size=512, + chunk_overlap=0, + hybrid_index="bm25", + embedding=SemanticEmbed( + representation="asymmetric", + model_name="luminous-base", + ), + ) + + collection_name = random_identifier() + collection_path = CollectionPath( + namespace=document_index_namespace, collection=collection_name + ) + + try: + document_index.create_collection(collection_path) + document_index.create_index(index_path, index_configuration) + document_index.assign_index_to_collection(collection_path, index_name) + + for name in filter_index_configs: + document_index.assign_filter_index_to_search_index( + collection_path=collection_path, + index_name=index_name, + filter_index_name=name, + ) + _add_documents_to_document_index( + document_index, document_contents_with_metadata, index_name, collection_path + ) + + yield 
collection_path, index_path + finally: + document_index.delete_collection(collection_path) + + @retry + def clean_up_indexes() -> None: + document_index.delete_index(index_path) + for filter_index_name in filter_index_configs: + document_index.delete_filter_index_from_namespace( + document_index_namespace, filter_index_name + ) + + clean_up_indexes() + + +@fixture +def random_searchable_collection( + document_index: DocumentIndexClient, + document_contents_with_metadata: list[DocumentContents], + random_index: tuple[IndexPath, IndexConfiguration], + random_collection: CollectionPath, +) -> Iterator[tuple[CollectionPath, IndexPath]]: + index_path, _ = random_index + index_name = index_path.index + collection_path = random_collection + + try: + # Assign index + document_index.assign_index_to_collection(collection_path, index_name) + + _add_documents_to_document_index( + document_index, document_contents_with_metadata, index_name, collection_path + ) + + yield collection_path, index_path + finally: + document_index.delete_collection(collection_path) + + @retry + def clean_up_index() -> None: + document_index.delete_index(index_path) + + clean_up_index() + + +@fixture +def document_index_retriever( + read_only_populated_collection: tuple[CollectionPath, IndexPath], + document_index: DocumentIndexClient, +) -> DocumentIndexRetriever: + return DocumentIndexRetriever( + document_index, + index_name=read_only_populated_collection[1].index, + namespace=read_only_populated_collection[0].namespace, + collection=read_only_populated_collection[0].collection, + k=2, + ) diff --git a/tests/connectors/document_index/test_document_index.py b/tests/connectors/document_index/test_document_index.py index db4a168ef..97f2bed2e 100644 --- a/tests/connectors/document_index/test_document_index.py +++ b/tests/connectors/document_index/test_document_index.py @@ -20,7 +20,7 @@ ResourceNotFound, SearchQuery, ) -from tests.conftest import random_embedding_config, retry +from 
tests.conftest_document_index import random_embedding_config, retry @pytest.mark.internal @@ -344,7 +344,6 @@ def test_assign_filter_indexes_to_collection( assigned_indexes = document_index.list_assigned_filter_index_names( collection_path, index_name ) - assert all( filter_index in assigned_indexes for filter_index in filter_index_configs ) diff --git a/tests/connectors/retrievers/test_document_index_retriever.py b/tests/connectors/retrievers/test_document_index_retriever.py index 575d0424b..faee45f5f 100644 --- a/tests/connectors/retrievers/test_document_index_retriever.py +++ b/tests/connectors/retrievers/test_document_index_retriever.py @@ -9,7 +9,5 @@ def test_document_index_retriever( document_index_retriever: DocumentIndexRetriever, ) -> None: - documents = document_index_retriever.get_relevant_documents_with_scores( - "Who took part in the war?" - ) - assert len(documents) == 2 + documents = document_index_retriever.get_relevant_documents_with_scores("Coca-Cola") + assert len(documents) > 0 diff --git a/tests/connectors/retrievers/test_qdrant_in_memory_retriever.py b/tests/connectors/retrievers/test_qdrant_in_memory_retriever.py index 6df4929ea..555258151 100644 --- a/tests/connectors/retrievers/test_qdrant_in_memory_retriever.py +++ b/tests/connectors/retrievers/test_qdrant_in_memory_retriever.py @@ -6,7 +6,7 @@ from intelligence_layer.connectors.retrievers.qdrant_in_memory_retriever import ( QdrantInMemoryRetriever, ) -from tests.conftest import to_document +from tests.conftest_document_index import to_document @fixture diff --git a/tests/examples/classify/test_embedding_based_classify.py b/tests/examples/classify/test_embedding_based_classify.py index ce93208df..9eee5829c 100644 --- a/tests/examples/classify/test_embedding_based_classify.py +++ b/tests/examples/classify/test_embedding_based_classify.py @@ -22,7 +22,7 @@ QdrantSearch, QdrantSearchInput, ) -from tests.conftest import to_document +from tests.conftest_document_index import to_document 
@fixture diff --git a/tests/examples/search/test_search.py b/tests/examples/search/test_search.py index 94015204b..f515c810b 100644 --- a/tests/examples/search/test_search.py +++ b/tests/examples/search/test_search.py @@ -20,7 +20,7 @@ SearchInput, SearchOutput, ) -from tests.conftest import to_document +from tests.conftest_document_index import to_document @fixture From 90ad317798d6169775165cd74621c3967ca0ed75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20K=C3=B6hnecke?= Date: Mon, 16 Dec 2024 12:00:37 +0100 Subject: [PATCH 14/15] fix: issue where error was not thrown correctly in studio benchmark --- .../evaluation/benchmark/studio_benchmark.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/intelligence_layer/evaluation/benchmark/studio_benchmark.py b/src/intelligence_layer/evaluation/benchmark/studio_benchmark.py index 988e1af93..23235024c 100644 --- a/src/intelligence_layer/evaluation/benchmark/studio_benchmark.py +++ b/src/intelligence_layer/evaluation/benchmark/studio_benchmark.py @@ -220,6 +220,10 @@ def create_benchmark( except requests.HTTPError as e: if e.response.status_code == HTTPStatus.BAD_REQUEST: raise ValueError(f"Dataset with ID {dataset_id} not found") from e + else: + raise ValueError( + "An error occurred when attempting to create a benchmark." + ) from e return StudioBenchmark( benchmark_id, From 81940074e8e38dbcff2385404fe80dd7e21fd7ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20K=C3=B6hnecke?= Date: Mon, 16 Dec 2024 12:00:58 +0100 Subject: [PATCH 15/15] docs: update changelog about new env variables --- .env.example | 9 ++++++--- CHANGELOG.md | 3 +++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/.env.example b/.env.example index fb3ccf8dc..c702fcdae 100644 --- a/.env.example +++ b/.env.example @@ -12,11 +12,14 @@ POSTGRES_DB=il_sdk POSTGRES_USER=il_sdk POSTGRES_PASSWORD=test -# things to adapt +# ---- Things to adapt ---- CLIENT_URL=... -AUTHORIZATION_SERVICE_URL=... AA_TOKEN=token -DATA_SERVICE_URL=... 
DOCUMENT_INDEX_URL=... +# needed for studio integration +DATA_SERVICE_URL=... +AUTHORIZATION_SERVICE_URL=... + +# needed for hugging face integration HUGGING_FACE_TOKEN=token diff --git a/CHANGELOG.md b/CHANGELOG.md index c513248c9..7160f4f67 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,9 @@ ### Breaking Changes - The env variable `POSTGRES_HOST` is split into `POSTGRES_HOST` and `POSTGRES_PORT`. This affects all classes interacting with Studio and the `InstructionFinetuningDataRepository`. + - The following env variables now need to be set (previously pointed to defaults) + - `CLIENT_URL` - URL of your inference stack + - `DOCUMENT_INDEX_URL` - URL of the document index ## 8.0.0