Merge branch 'main' into docemb

deepset-ai · Oct 23, 2023 · cd59aad · cd59aad
2 parents 11070ef + 101bd81
commit cd59aad
Show file tree

Hide file tree

Showing 11 changed files with 118 additions and 97 deletions.
diff --git a/haystack/preview/components/file_converters/azure.py b/haystack/preview/components/file_converters/azure.py
@@ -1,5 +1,6 @@
 from pathlib import Path
-from typing import List, Union, Dict, Any
+from typing import List, Union, Dict, Any, Optional
+import os
 
 from haystack.preview.lazy_imports import LazyImport
 from haystack.preview import component, Document, default_to_dict
@@ -22,22 +23,33 @@ class AzureOCRDocumentConverter:
     to set up your resource.
     """
 
-    def __init__(self, endpoint: str, api_key: str, model_id: str = "prebuilt-read"):
+    def __init__(self, endpoint: str, api_key: Optional[str] = None, model_id: str = "prebuilt-read"):
         """
         Create an AzureOCRDocumentConverter component.
 
         :param endpoint: The endpoint of your Azure resource.
-        :param api_key: The key of your Azure resource.
+        :param api_key: The key of your Azure resource. It can be
+        explicitly provided or automatically read from the
+        environment variable AZURE_AI_API_KEY (recommended).
         :param model_id: The model ID of the model you want to use. Please refer to [Azure documentation](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/choose-model-feature)
             for a list of available models. Default: `"prebuilt-read"`.
         """
         azure_import.check()
 
+        if api_key is None:
+            try:
+                api_key = os.environ["AZURE_AI_API_KEY"]
+            except KeyError as e:
+                raise ValueError(
+                    "AzureOCRDocumentConverter expects an Azure Credential key. "
+                    "Set the AZURE_AI_API_KEY environment variable (recommended) or pass it explicitly."
+                ) from e
+
+        self.api_key = api_key
         self.document_analysis_client = DocumentAnalysisClient(
             endpoint=endpoint, credential=AzureKeyCredential(api_key)
         )
         self.endpoint = endpoint
-        self.api_key = api_key
         self.model_id = model_id
 
     @component.output_types(documents=List[Document], azure=List[Dict])
@@ -70,7 +82,7 @@ def to_dict(self) -> Dict[str, Any]:
         """
         Serialize this component to a dictionary.
         """
-        return default_to_dict(self, endpoint=self.endpoint, api_key=self.api_key, model_id=self.model_id)
+        return default_to_dict(self, endpoint=self.endpoint, model_id=self.model_id)
 
     @staticmethod
     def _convert_azure_result_to_document(result: "AnalyzeResult", file_suffix: str) -> Document:

diff --git a/haystack/preview/components/websearch/serper_dev.py b/haystack/preview/components/websearch/serper_dev.py
@@ -1,4 +1,5 @@
 import json
+import os
 import logging
 from typing import Dict, List, Optional, Any
 
@@ -26,20 +27,29 @@ class SerperDevWebSearch:
 
     def __init__(
         self,
-        api_key: str,
+        api_key: Optional[str] = None,
         top_k: Optional[int] = 10,
         allowed_domains: Optional[List[str]] = None,
         search_params: Optional[Dict[str, Any]] = None,
     ):
         """
-        :param api_key: API key for the SerperDev API.
+        :param api_key: API key for the SerperDev API. It can be
+        explicitly provided or automatically read from the
+        environment variable SERPERDEV_API_KEY (recommended).
         :param top_k: Number of documents to return.
         :param allowed_domains: List of domains to limit the search to.
         :param search_params: Additional parameters passed to the SerperDev API.
         For example, you can set 'num' to 20 to increase the number of search results.
         See the [Serper Dev website](https://serper.dev/) for more details.
+        :raises ValueError: If no API key is passed and SERPERDEV_API_KEY is not set.
         """
         if api_key is None:
+            try:
+                api_key = os.environ["SERPERDEV_API_KEY"]
+            except KeyError as e:
+                raise ValueError(
+                    "SerperDevWebSearch expects an API key. "
+                    "Set the SERPERDEV_API_KEY environment variable (recommended) or pass it explicitly."
+                ) from e
-            raise ValueError("API key for SerperDev API must be set.")
         self.api_key = api_key
         self.top_k = top_k
@@ -51,11 +61,7 @@ def to_dict(self) -> Dict[str, Any]:
         Serialize this component to a dictionary.
         """
         return default_to_dict(
-            self,
-            api_key=self.api_key,
-            top_k=self.top_k,
-            allowed_domains=self.allowed_domains,
-            search_params=self.search_params,
+            self, top_k=self.top_k, allowed_domains=self.allowed_domains, search_params=self.search_params
         )
 
     @component.output_types(documents=List[Document], links=List[str])

diff --git a/haystack/preview/dataclasses/document.py b/haystack/preview/dataclasses/document.py
@@ -3,7 +3,7 @@
 import logging
 from dataclasses import asdict, dataclass, field, fields
 from pathlib import Path
-from typing import Any, Dict, Optional, Type
+from typing import Any, Dict, List, Optional, Type
 
 import numpy
 import pandas
@@ -42,8 +42,6 @@ def document_decoder(self, dictionary):
             dictionary["array"] = numpy.array(dictionary.get("array"))
         if "dataframe" in dictionary and dictionary.get("dataframe"):
             dictionary["dataframe"] = pandas.read_json(dictionary.get("dataframe", None))
-        if "embedding" in dictionary and dictionary.get("embedding"):
-            dictionary["embedding"] = numpy.array(dictionary.get("embedding"))
 
         return dictionary
 
@@ -75,7 +73,7 @@ class Document:
     mime_type: str = field(default="text/plain")
     metadata: Dict[str, Any] = field(default_factory=dict)
     score: Optional[float] = field(default=None)
-    embedding: Optional[numpy.ndarray] = field(default=None, repr=False)
+    embedding: Optional[List[float]] = field(default=None, repr=False)
 
     def __str__(self):
         fields = [f"mimetype: '{self.mime_type}'"]
@@ -120,7 +118,7 @@ def _create_id(self):
         blob = self.blob or None
         mime_type = self.mime_type or None
         metadata = self.metadata or {}
-        embedding = self.embedding.tolist() if self.embedding is not None else None
+        embedding = self.embedding  # typed Optional[List[float]]; no conversion needed before hashing
         data = f"{text}{array}{dataframe}{blob}{mime_type}{metadata}{embedding}"
         return hashlib.sha256(data.encode("utf-8")).hexdigest()
 

diff --git a/haystack/preview/testing/document_store.py b/haystack/preview/testing/document_store.py
@@ -1,5 +1,6 @@
 # pylint: disable=too-many-public-methods
 from typing import List
+import random
 
 import pytest
 import numpy as np
@@ -11,37 +12,41 @@
 from haystack.preview.errors import FilterError
 
 
+def _random_embeddings(n):
+    return [random.random() for _ in range(n)]
+
+
 class DocumentStoreBaseTests:
     @pytest.fixture
     def docstore(self) -> DocumentStore:
         raise NotImplementedError()
 
     @pytest.fixture
     def filterable_docs(self) -> List[Document]:
-        embedding_zero = np.zeros(768).astype(np.float32)
-        embedding_one = np.ones(768).astype(np.float32)
+        embedding_zero = [0.0] * 768
+        embedding_one = [1.0] * 768
 
         documents = []
         for i in range(3):
             documents.append(
                 Document(
                     text=f"A Foo Document {i}",
                     metadata={"name": f"name_{i}", "page": "100", "chapter": "intro", "number": 2},
-                    embedding=np.random.rand(768).astype(np.float32),
+                    embedding=_random_embeddings(768),
                 )
             )
             documents.append(
                 Document(
                     text=f"A Bar Document {i}",
                     metadata={"name": f"name_{i}", "page": "123", "chapter": "abstract", "number": -2},
-                    embedding=np.random.rand(768).astype(np.float32),
+                    embedding=_random_embeddings(768),
                 )
             )
             documents.append(
                 Document(
                     text=f"A Foobar Document {i}",
                     metadata={"name": f"name_{i}", "page": "90", "chapter": "conclusion", "number": -10},
-                    embedding=np.random.rand(768).astype(np.float32),
+                    embedding=_random_embeddings(768),
                 )
             )
             documents.append(
@@ -209,11 +214,9 @@ def test_eq_filter_table(self, docstore: DocumentStore, filterable_docs: List[Do
     @pytest.mark.unit
     def test_eq_filter_embedding(self, docstore: DocumentStore, filterable_docs: List[Document]):
         docstore.write_documents(filterable_docs)
-        embedding = np.zeros(768).astype(np.float32)
+        embedding = [0.0] * 768
         result = docstore.filter_documents(filters={"embedding": embedding})
-        assert self.contains_same_docs(
-            result, [doc for doc in filterable_docs if np.array_equal(embedding, doc.embedding)]  # type: ignore
-        )
+        assert self.contains_same_docs(result, [doc for doc in filterable_docs if embedding == doc.embedding])
 
     @pytest.mark.unit
     def test_in_filter_explicit(self, docstore: DocumentStore, filterable_docs: List[Document]):
@@ -248,17 +251,12 @@ def test_in_filter_table(self, docstore: DocumentStore, filterable_docs: List[Do
     @pytest.mark.unit
     def test_in_filter_embedding(self, docstore: DocumentStore, filterable_docs: List[Document]):
         docstore.write_documents(filterable_docs)
-        embedding_zero = np.zeros(768, np.float32)
-        embedding_one = np.ones(768, np.float32)
+        embedding_zero = [0.0] * 768
+        embedding_one = [1.0] * 768
         result = docstore.filter_documents(filters={"embedding": {"$in": [embedding_zero, embedding_one]}})
         assert self.contains_same_docs(
             result,
-            [
-                doc
-                for doc in filterable_docs
-                if isinstance(doc.embedding, np.ndarray)
-                and (np.array_equal(embedding_zero, doc.embedding) or np.array_equal(embedding_one, doc.embedding))
-            ],
+            [doc for doc in filterable_docs if (embedding_zero == doc.embedding or embedding_one == doc.embedding)],
         )
 
     @pytest.mark.unit

diff --git a/releasenotes/notes/document-embedding-type-d66c44ac6878fbdd.yaml b/releasenotes/notes/document-embedding-type-d66c44ac6878fbdd.yaml
@@ -0,0 +1,4 @@
+---
+preview:
+  - |
+    Change `Document`'s `embedding` field type from `numpy.ndarray` to `List[float]`
diff --git a/releasenotes/notes/remove-api-key-from-serialization-2474a1539b86e233.yaml b/releasenotes/notes/remove-api-key-from-serialization-2474a1539b86e233.yaml
@@ -0,0 +1,4 @@
+---
+preview:
+  - |
+    Remove "api_key" from serialization of AzureOCRDocumentConverter and SerperDevWebSearch.
diff --git a/test/preview/components/file_converters/test_azure_ocr_doc_converter.py b/test/preview/components/file_converters/test_azure_ocr_doc_converter.py
@@ -7,17 +7,19 @@
 
 
 class TestAzureOCRDocumentConverter:
+    @pytest.mark.unit
+    def test_init_fail_wo_api_key(self, monkeypatch):
+        monkeypatch.delenv("AZURE_AI_API_KEY", raising=False)
+        with pytest.raises(ValueError, match="AzureOCRDocumentConverter expects an Azure Credential key"):
+            AzureOCRDocumentConverter(endpoint="test_endpoint")
+
     @pytest.mark.unit
     def test_to_dict(self):
         component = AzureOCRDocumentConverter(endpoint="test_endpoint", api_key="test_credential_key")
         data = component.to_dict()
         assert data == {
             "type": "AzureOCRDocumentConverter",
-            "init_parameters": {
-                "api_key": "test_credential_key",
-                "endpoint": "test_endpoint",
-                "model_id": "prebuilt-read",
-            },
+            "init_parameters": {"endpoint": "test_endpoint", "model_id": "prebuilt-read"},
         }
 
     @pytest.mark.unit

diff --git a/test/preview/components/retrievers/test_in_memory_embedding_retriever.py b/test/preview/components/retrievers/test_in_memory_embedding_retriever.py
@@ -118,9 +118,9 @@ def test_valid_run(self):
         top_k = 3
         ds = InMemoryDocumentStore(embedding_similarity_function="cosine")
         docs = [
-            Document(text="my document", embedding=np.array([0.1, 0.2, 0.3, 0.4])),
-            Document(text="another document", embedding=np.array([1.0, 1.0, 1.0, 1.0])),
-            Document(text="third document", embedding=np.array([0.5, 0.7, 0.5, 0.7])),
+            Document(text="my document", embedding=[0.1, 0.2, 0.3, 0.4]),
+            Document(text="another document", embedding=[1.0, 1.0, 1.0, 1.0]),
+            Document(text="third document", embedding=[0.5, 0.7, 0.5, 0.7]),
         ]
         ds.write_documents(docs)
 
@@ -142,17 +142,17 @@ def test_run_with_pipeline(self):
         ds = InMemoryDocumentStore(embedding_similarity_function="cosine")
         top_k = 2
         docs = [
-            Document(text="my document", embedding=np.array([0.1, 0.2, 0.3, 0.4])),
-            Document(text="another document", embedding=np.array([1.0, 1.0, 1.0, 1.0])),
-            Document(text="third document", embedding=np.array([0.5, 0.7, 0.5, 0.7])),
+            Document(text="my document", embedding=[0.1, 0.2, 0.3, 0.4]),
+            Document(text="another document", embedding=[1.0, 1.0, 1.0, 1.0]),
+            Document(text="third document", embedding=[0.5, 0.7, 0.5, 0.7]),
         ]
         ds.write_documents(docs)
         retriever = InMemoryEmbeddingRetriever(ds, top_k=top_k)
 
         pipeline = Pipeline()
         pipeline.add_component("retriever", retriever)
         result: Dict[str, Any] = pipeline.run(
-            data={"retriever": {"query_embedding": np.array([0.1, 0.1, 0.1, 0.1]), "return_embedding": True}}
+            data={"retriever": {"query_embedding": [0.1, 0.1, 0.1, 0.1], "return_embedding": True}}
         )
 
         assert result

diff --git a/test/preview/components/websearch/test_serperdev.py b/test/preview/components/websearch/test_serperdev.py
@@ -108,6 +108,12 @@ def mock_serper_dev_search_result():
 
 
 class TestSerperDevSearchAPI:
+    @pytest.mark.unit
+    def test_init_fail_wo_api_key(self, monkeypatch):
+        monkeypatch.delenv("SERPERDEV_API_KEY", raising=False)
+        with pytest.raises(ValueError, match="SerperDevWebSearch expects an API key"):
+            SerperDevWebSearch()
+
     @pytest.mark.unit
     def test_to_dict(self):
         component = SerperDevWebSearch(
@@ -116,12 +122,7 @@ def test_to_dict(self):
         data = component.to_dict()
         assert data == {
             "type": "SerperDevWebSearch",
-            "init_parameters": {
-                "api_key": "test_key",
-                "top_k": 10,
-                "allowed_domains": ["test.com"],
-                "search_params": {"param": "test"},
-            },
+            "init_parameters": {"top_k": 10, "allowed_domains": ["test.com"], "search_params": {"param": "test"}},
         }
 
     @pytest.mark.unit

diff --git a/test/preview/dataclasses/test_document.py b/test/preview/dataclasses/test_document.py
@@ -71,8 +71,8 @@ def __eq__(self, other):
                 return True
 
     foo = TestObject()
-    doc1 = Document(text="test text", metadata={"value": np.array([0, 1, 2]), "path": Path("."), "obj": foo})
-    doc2 = Document(text="test text", metadata={"value": np.array([0, 1, 2]), "path": Path("."), "obj": foo})
+    doc1 = Document(text="test text", metadata={"value": [0, 1, 2], "path": Path("."), "obj": foo})
+    doc2 = Document(text="test text", metadata={"value": [0, 1, 2], "path": Path("."), "obj": foo})
     assert doc1 == doc2
 
 
@@ -107,7 +107,7 @@ def test_full_document_to_dict():
         mime_type="application/pdf",
         metadata={"some": "values", "test": 10},
         score=0.99,
-        embedding=np.zeros([10, 10]),
+        embedding=[10, 10],
     )
     dictionary = doc.to_dict()
 
@@ -121,7 +121,7 @@ def test_full_document_to_dict():
     assert blob == doc.blob
 
     embedding = dictionary.pop("embedding")
-    assert (embedding == doc.embedding).all()
+    assert embedding == doc.embedding
 
     assert dictionary == {
         "id": doc.id,
@@ -134,7 +134,7 @@ def test_full_document_to_dict():
 
 @pytest.mark.unit
 def test_document_with_most_attributes_from_dict():
-    embedding = np.zeros([10, 10])
+    embedding = [10, 10]
     assert Document.from_dict(
         {
             "text": "test text",
@@ -194,7 +194,7 @@ def __repr__(self):
         mime_type="application/pdf",
         metadata={"some object": TestClass(), "a path": tmp_path / "test.txt"},
         score=0.5,
-        embedding=np.array([1, 2, 3, 4]),
+        embedding=[1, 2, 3, 4],
     )
     assert doc_1.to_json() == json.dumps(
         {
@@ -241,7 +241,7 @@ def __eq__(self, other):
         # Note the object serialization
         metadata={"some object": "<the object>", "a path": str((tmp_path / "test.txt").absolute())},
         score=0.5,
-        embedding=np.array([1, 2, 3, 4]),
+        embedding=[1, 2, 3, 4],
     )