Patch summary (free text placed before the first "diff --git" header).

* haystack/document_stores/opensearch.py and haystack/document_stores/search_engine.py:
  get_metadata_values_by_key gains a trailing `batch_size: int = 10` parameter. Its value is sent
  as the "size" of the composite aggregation "metadata_agg". `index` now falls back to
  `self.index` before the request body is built. The docstring documents the new parameter.
* test/document_stores/test_search_engine.py:
  imports `Document` and adds an integration test that writes 10,000 documents, calls
  get_metadata_values_by_key(key="name", batch_size=1_000) and compares the result with the
  expected list sorted by "value".

NOTE(review): the leading indentation of the Python lines below was reconstructed; confirm it
against the target files before applying (or apply with whitespace-insensitive matching).

diff --git a/haystack/document_stores/opensearch.py b/haystack/document_stores/opensearch.py
index 0d6e776660..8194b5aa40 100644
--- a/haystack/document_stores/opensearch.py
+++ b/haystack/document_stores/opensearch.py
@@ -1521,6 +1521,7 @@ def get_metadata_values_by_key(
         filters: Optional[FilterType] = None,
         index: Optional[str] = None,
         headers: Optional[Dict[str, str]] = None,
+        batch_size: int = 10,
     ) -> List[dict]:
         """
         Get values associated with a metadata key. The output is in the format:
@@ -1558,10 +1559,16 @@ def get_metadata_values_by_key(
                       self.index is used.
         :param headers: Custom HTTP headers to pass to the client (for example, {'Authorization': 'Basic YWRtaW46cm9vdA=='})
                 Check out [Elasticsearch documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html) for more information.
+        :param batch_size: Maximum number of results for each request.
+                           Limited to 10 values by default. You can increase this limit to decrease retrieval time.
+                           To reduce the pressure on the cluster, you shouldn't set this higher than 1,000.
         """
+        index = index or self.index
         body: dict = {
             "size": 0,
-            "aggs": {"metadata_agg": {"composite": {"sources": [{key: {"terms": {"field": key}}}]}}},
+            "aggs": {
+                "metadata_agg": {"composite": {"sources": [{key: {"terms": {"field": key}}}], "size": batch_size}}
+            },
         }
         if query:
             body["query"] = {
diff --git a/haystack/document_stores/search_engine.py b/haystack/document_stores/search_engine.py
index eac2a6ec7a..8f5a149e31 100644
--- a/haystack/document_stores/search_engine.py
+++ b/haystack/document_stores/search_engine.py
@@ -286,6 +286,7 @@ def get_metadata_values_by_key(
         filters: Optional[FilterType] = None,
         index: Optional[str] = None,
         headers: Optional[Dict[str, str]] = None,
+        batch_size: int = 10,
     ) -> List[dict]:
         """
         Get values associated with a metadata key. The output is in the format:
@@ -323,10 +324,16 @@ def get_metadata_values_by_key(
                       self.index will be used.
         :param headers: Custom HTTP headers to pass to the client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
                 Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
+        :param batch_size: Maximum number of results for each request.
+                           Limited to 10 values by default. You can increase this limit to decrease retrieval time.
+                           To reduce the pressure on the cluster, you shouldn't set this higher than 1,000.
         """
+        index = index or self.index
         body: dict = {
             "size": 0,
-            "aggs": {"metadata_agg": {"composite": {"sources": [{key: {"terms": {"field": key}}}]}}},
+            "aggs": {
+                "metadata_agg": {"composite": {"sources": [{key: {"terms": {"field": key}}}], "size": batch_size}}
+            },
         }
         if query:
             body["query"] = {
diff --git a/test/document_stores/test_search_engine.py b/test/document_stores/test_search_engine.py
index fc819fa7ed..30f4225b07 100644
--- a/test/document_stores/test_search_engine.py
+++ b/test/document_stores/test_search_engine.py
@@ -4,7 +4,7 @@
 import numpy as np
 import pytest
 from haystack.document_stores.search_engine import SearchEngineDocumentStore
-from haystack.schema import FilterType
+from haystack.schema import Document, FilterType
 
 
 @pytest.mark.unit
@@ -60,6 +60,14 @@ def test_get_meta_values_by_key(self, ds, documents):
         result = ds.get_metadata_values_by_key(key="year", query="Bar")
         assert result == [{"count": 3, "value": "2021"}]
 
+    @pytest.mark.integration
+    def test_get_meta_values_by_key_with_batch_size(self, ds):
+        docs = [Document(f"content_{i}", meta={"name": f"name_{i}"}) for i in range(10_000)]
+        ds.write_documents(docs)
+
+        result = ds.get_metadata_values_by_key(key="name", batch_size=1_000)
+        assert result == sorted([{"count": 1, "value": f"name_{i}"} for i in range(10_000)], key=lambda x: x["value"])
+
     @pytest.mark.unit
     def test_query_return_embedding_true(self, mocked_document_store):
         mocked_document_store.return_embedding = True