chore: Update docstring and type of fuzziness (#1243)

* Update docstring and type of fuzziness
* Add test
deepset-ai · Dec 12, 2024 · 63f20c0 · 63f20c0
1 parent 31d14a1
commit 63f20c0
Show file tree

Hide file tree

Showing 3 changed files with 48 additions and 7 deletions.
diff --git a/...s/opensearch/src/haystack_integrations/components/retrievers/opensearch/bm25_retriever.py b/...s/opensearch/src/haystack_integrations/components/retrievers/opensearch/bm25_retriever.py
@@ -27,7 +27,7 @@ def __init__(
         *,
         document_store: OpenSearchDocumentStore,
         filters: Optional[Dict[str, Any]] = None,
-        fuzziness: str = "AUTO",
+        fuzziness: Union[int, str] = "AUTO",
         top_k: int = 10,
         scale_score: bool = False,
         all_terms_must_match: bool = False,
@@ -40,8 +40,14 @@ def __init__(
 
         :param document_store: An instance of OpenSearchDocumentStore to use with the Retriever.
         :param filters: Filters to narrow down the search for documents in the Document Store.
-        :param fuzziness: Fuzziness parameter for full-text queries to apply approximate string matching.
-        For more information, see [OpenSearch fuzzy query](https://opensearch.org/docs/latest/query-dsl/term/fuzzy/).
+        :param fuzziness: Determines how approximate string matching is applied in full-text queries.
+            This parameter sets the maximum number of character edits (insertions, deletions, or
+            substitutions) allowed when matching one word to another. For example, the edit distance
+            between "wined" and "wind" is 1 because only one edit is needed to transform one into the other.
+
+            Accepts a non-negative integer or "AUTO". Use "AUTO" (the default) for automatic adjustment based
+            on term length, which is optimal for most scenarios. For detailed guidance, refer to the
+            [OpenSearch fuzzy query documentation](https://opensearch.org/docs/latest/query-dsl/term/fuzzy/).
         :param top_k: Maximum number of documents to return.
         :param scale_score: If `True`, scales the score of retrieved documents to a range between 0 and 1.
             This is useful when comparing documents across different indexes.
@@ -153,7 +159,7 @@ def run(
         filters: Optional[Dict[str, Any]] = None,
         all_terms_must_match: Optional[bool] = None,
         top_k: Optional[int] = None,
-        fuzziness: Optional[str] = None,
+        fuzziness: Optional[Union[int, str]] = None,
         scale_score: Optional[bool] = None,
         custom_query: Optional[Dict[str, Any]] = None,
     ):

diff --git a/...rations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/...rations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py
@@ -340,7 +340,7 @@ def _bm25_retrieval(
         query: str,
         *,
         filters: Optional[Dict[str, Any]] = None,
-        fuzziness: str = "AUTO",
+        fuzziness: Union[int, str] = "AUTO",
         top_k: int = 10,
         scale_score: bool = False,
         all_terms_must_match: bool = False,
@@ -357,8 +357,14 @@ def _bm25_retrieval(
 
         :param query: String to search in saved Documents' text.
         :param filters: Optional filters to narrow down the search space.
-        :param fuzziness: Fuzziness parameter passed to OpenSearch, defaults to "AUTO". see the official documentation
-                          for valid [fuzziness values](https://www.elastic.co/guide/en/OpenSearch/reference/current/common-options.html#fuzziness)
+        :param fuzziness: Determines how approximate string matching is applied in full-text queries.
+            This parameter sets the maximum number of character edits (insertions, deletions, or
+            substitutions) allowed when matching one word to another. For example, the edit distance
+            between "wined" and "wind" is 1 because only one edit is needed to transform one into the other.
+
+            Accepts a non-negative integer or "AUTO". Use "AUTO" (the default) for automatic adjustment based
+            on term length, which is optimal for most scenarios. For detailed guidance, refer to the
+            [OpenSearch fuzzy query documentation](https://opensearch.org/docs/latest/query-dsl/term/fuzzy/).
         :param top_k: Maximum number of Documents to return, defaults to 10
         :param scale_score: If `True` scales the Document`s scores between 0 and 1, defaults to False
         :param all_terms_must_match: If `True` all terms in `query` must be present in the Document, defaults to False

diff --git a/integrations/opensearch/tests/test_bm25_retriever.py b/integrations/opensearch/tests/test_bm25_retriever.py
@@ -121,6 +121,35 @@ def test_from_dict(_mock_opensearch_client):
     assert retriever._filter_policy == FilterPolicy.REPLACE
 
 
+@patch("haystack_integrations.document_stores.opensearch.document_store.OpenSearch")
+def test_from_dict_not_defaults(_mock_opensearch_client):
+    data = {
+        "type": "haystack_integrations.components.retrievers.opensearch.bm25_retriever.OpenSearchBM25Retriever",
+        "init_parameters": {
+            "document_store": {
+                "init_parameters": {"hosts": "some fake host", "index": "default"},
+                "type": "haystack_integrations.document_stores.opensearch.document_store.OpenSearchDocumentStore",
+            },
+            "filters": {},
+            "fuzziness": 0,
+            "top_k": 15,
+            "scale_score": True,
+            "filter_policy": "replace",
+            "custom_query": {"some": "custom query"},
+            "raise_on_failure": True,
+        },
+    }
+    retriever = OpenSearchBM25Retriever.from_dict(data)
+    assert retriever._document_store
+    assert retriever._filters == {}
+    assert retriever._fuzziness == 0
+    assert retriever._top_k == 15
+    assert retriever._scale_score
+    assert retriever._filter_policy == FilterPolicy.REPLACE
+    assert retriever._custom_query == {"some": "custom query"}
+    assert retriever._raise_on_failure is True
+
+
 def test_run():
     mock_store = Mock(spec=OpenSearchDocumentStore)
     mock_store._bm25_retrieval.return_value = [Document(content="Test doc")]