From c42c1f43ba8fdab074dfb81d8a557b9e55c44961 Mon Sep 17 00:00:00 2001 From: Amna Mubashar Date: Mon, 28 Oct 2024 11:03:46 +0100 Subject: [PATCH] Update the example based on review --- .github/workflows/azure_ai_search.yml | 1 - .../example/embedding_retrieval.py | 19 +++++++++++++++---- .../azure_ai_search/tests/conftest.py | 6 ++++-- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/.github/workflows/azure_ai_search.yml b/.github/workflows/azure_ai_search.yml index f6d544865..294bb4c64 100644 --- a/.github/workflows/azure_ai_search.yml +++ b/.github/workflows/azure_ai_search.yml @@ -30,7 +30,6 @@ jobs: runs-on: ${{ matrix.os }} strategy: fail-fast: false - max-parallel: 1 matrix: os: [ubuntu-latest] python-version: ["3.8", "3.9", "3.10", "3.11"] diff --git a/integrations/azure_ai_search/example/embedding_retrieval.py b/integrations/azure_ai_search/example/embedding_retrieval.py index 20904f5f7..e323c33e3 100644 --- a/integrations/azure_ai_search/example/embedding_retrieval.py +++ b/integrations/azure_ai_search/example/embedding_retrieval.py @@ -1,12 +1,13 @@ from haystack import Document, Pipeline from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder +from haystack.components.writers import DocumentWriter from haystack.document_stores.types import DuplicatePolicy from haystack_integrations.components.retrievers.azure_ai_search import AzureAISearchEmbeddingRetriever from haystack_integrations.document_stores.azure_ai_search import AzureAISearchDocumentStore """ -This example demonstrates how to use the AzureAISearchEmbeddingRetriever to retrieve documents based on a query. +This example demonstrates how to use the AzureAISearchEmbeddingRetriever to retrieve documents using embeddings based on a query. To run this example, you'll need an Azure Search service endpoint and API key, which can either be set as environment variables (AZURE_SEARCH_SERVICE_ENDPOINT and AZURE_SEARCH_API_KEY) or provided directly to AzureAISearchDocumentStore(as params "api_key", "azure_endpoint"). @@ -14,7 +15,7 @@ See more details at https://learn.microsoft.com/en-us/azure/search/keyless-connections?tabs=python%2Cazure-cli """ -document_store = AzureAISearchDocumentStore() +document_store = AzureAISearchDocumentStore(index_name="retrieval-example") model = "sentence-transformers/all-mpnet-base-v2" @@ -32,8 +33,18 @@ document_embedder = SentenceTransformersDocumentEmbedder(model=model) document_embedder.warm_up() -documents_with_embeddings = document_embedder.run(documents) -document_store.write_documents(documents_with_embeddings.get("documents"), policy=DuplicatePolicy.SKIP) + +# Indexing Pipeline +indexing_pipeline = Pipeline() +indexing_pipeline.add_component(instance=document_embedder, name="doc_embedder") +indexing_pipeline.add_component( + instance=DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP), name="doc_writer" +) +indexing_pipeline.connect("doc_embedder", "doc_writer") + +indexing_pipeline.run({"doc_embedder": {"documents": documents}}) + +# Query Pipeline query_pipeline = Pipeline() query_pipeline.add_component("text_embedder", SentenceTransformersTextEmbedder(model=model)) query_pipeline.add_component("retriever", AzureAISearchEmbeddingRetriever(document_store=document_store)) diff --git a/integrations/azure_ai_search/tests/conftest.py b/integrations/azure_ai_search/tests/conftest.py index 2427d2550..48549d244 100644 --- a/integrations/azure_ai_search/tests/conftest.py +++ b/integrations/azure_ai_search/tests/conftest.py @@ -1,5 +1,6 @@ import os import time +import uuid import pytest from azure.core.credentials import AzureKeyCredential @@ -10,7 +11,7 @@ from haystack_integrations.document_stores.azure_ai_search import AzureAISearchDocumentStore # This is the approximate time in seconds it takes for the documents to be available in Azure Search index -SLEEP_TIME_IN_SECONDS = 10 +SLEEP_TIME_IN_SECONDS = 5 @pytest.fixture() @@ -24,7 +25,8 @@ def document_store(request): This is the most basic requirement for the child class: provide an instance of this document store so the base class can use it. """ - index_name = "haystack_test_integration" + index_name = f"haystack_test_{uuid.uuid4().hex}" + print (index_name) metadata_fields = getattr(request, "param", {}).get("metadata_fields", None) azure_endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]