Skip to content

Commit

Permalink
fix: use docstore.index for get_metadata_values_by_key (OpenSearch / …
Browse files Browse the repository at this point in the history
…elasticsearch) (#7562)

* fix: use docstore.index for get_metadata_values_by_key (OpenSearch / elasticsearch)

* add batch_size param
  • Loading branch information
tstadel authored Apr 22, 2024
1 parent 6d320f6 commit 28eda4b
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 3 deletions.
9 changes: 8 additions & 1 deletion haystack/document_stores/opensearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -1521,6 +1521,7 @@ def get_metadata_values_by_key(
filters: Optional[FilterType] = None,
index: Optional[str] = None,
headers: Optional[Dict[str, str]] = None,
batch_size: int = 10,
) -> List[dict]:
"""
Get values associated with a metadata key. The output is in the format:
Expand Down Expand Up @@ -1558,10 +1559,16 @@ def get_metadata_values_by_key(
self.index is used.
:param headers: Custom HTTP headers to pass to the client (for example, {'Authorization': 'Basic YWRtaW46cm9vdA=='})
Check out [Elasticsearch documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html) for more information.
:param batch_size: Maximum number of results for each request.
Limited to 10 values by default. You can increase this limit to decrease retrieval time.
To reduce the pressure on the cluster, you shouldn't set this higher than 1,000.
"""
index = index or self.index
body: dict = {
"size": 0,
"aggs": {"metadata_agg": {"composite": {"sources": [{key: {"terms": {"field": key}}}]}}},
"aggs": {
"metadata_agg": {"composite": {"sources": [{key: {"terms": {"field": key}}}], "size": batch_size}}
},
}
if query:
body["query"] = {
Expand Down
9 changes: 8 additions & 1 deletion haystack/document_stores/search_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,7 @@ def get_metadata_values_by_key(
filters: Optional[FilterType] = None,
index: Optional[str] = None,
headers: Optional[Dict[str, str]] = None,
batch_size: int = 10,
) -> List[dict]:
"""
Get values associated with a metadata key. The output is in the format:
Expand Down Expand Up @@ -323,10 +324,16 @@ def get_metadata_values_by_key(
self.index will be used.
:param headers: Custom HTTP headers to pass to the client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='})
Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information.
:param batch_size: Maximum number of results for each request.
Limited to 10 values by default. You can increase this limit to decrease retrieval time.
To reduce the pressure on the cluster, you shouldn't set this higher than 1,000.
"""
index = index or self.index
body: dict = {
"size": 0,
"aggs": {"metadata_agg": {"composite": {"sources": [{key: {"terms": {"field": key}}}]}}},
"aggs": {
"metadata_agg": {"composite": {"sources": [{key: {"terms": {"field": key}}}], "size": batch_size}}
},
}
if query:
body["query"] = {
Expand Down
10 changes: 9 additions & 1 deletion test/document_stores/test_search_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import numpy as np
import pytest
from haystack.document_stores.search_engine import SearchEngineDocumentStore
from haystack.schema import FilterType
from haystack.schema import Document, FilterType


@pytest.mark.unit
Expand Down Expand Up @@ -60,6 +60,14 @@ def test_get_meta_values_by_key(self, ds, documents):
result = ds.get_metadata_values_by_key(key="year", query="Bar")
assert result == [{"count": 3, "value": "2021"}]

@pytest.mark.integration
def test_get_meta_values_by_key_with_batch_size(self, ds):
    """
    Check that every distinct metadata value is returned when a custom batch_size is passed.

    Writes 10,000 documents, each with a unique `name` metadata value, then requests the
    values for key "name" with `batch_size=1_000` and expects one entry per document
    (count 1), ordered by value.
    """
    # NOTE(review): presumably 10,000 distinct values with batch_size=1_000 forces several
    # paged requests - confirm against the document store implementation.
    total = 10_000
    corpus = []
    for n in range(total):
        corpus.append(Document(f"content_{n}", meta={"name": f"name_{n}"}))
    ds.write_documents(corpus)

    result = ds.get_metadata_values_by_key(key="name", batch_size=1_000)

    # Values are compared in lexicographic order of the "value" string, not numeric order.
    expected = [{"count": 1, "value": f"name_{n}"} for n in range(total)]
    expected.sort(key=lambda entry: entry["value"])
    assert result == expected

@pytest.mark.unit
def test_query_return_embedding_true(self, mocked_document_store):
mocked_document_store.return_embedding = True
Expand Down

0 comments on commit 28eda4b

Please sign in to comment.