Remove Document.array field

deepset-ai · Oct 23, 2023 · 75e0156
1 parent 101bd81
commit 75e0156
Show file tree

Hide file tree

Showing 5 changed files with 14 additions and 68 deletions.
diff --git a/haystack/preview/dataclasses/document.py b/haystack/preview/dataclasses/document.py
@@ -38,8 +38,6 @@ def __init__(self, *_, object_hook=None, **__):
         super().__init__(object_hook=object_hook or self.document_decoder)
 
     def document_decoder(self, dictionary):
-        if "array" in dictionary and dictionary.get("array"):
-            dictionary["array"] = numpy.array(dictionary.get("array"))
         if "dataframe" in dictionary and dictionary.get("dataframe"):
             dictionary["dataframe"] = pandas.read_json(dictionary.get("dataframe", None))
 
@@ -55,8 +53,6 @@ class Document:
 
     :param id: Unique identifier for the document. When not set, it's generated based on the Document fields' values.
     :param text: Text of the document, if the document contains text.
-    :param array: Array of numbers associated with the document, if the document contains matrix data like image,
-        audio, video, and such.
     :param dataframe: Pandas dataframe with the document's content, if the document contains tabular data.
     :param blob: Binary data associated with the document, if the document has any binary data associated with it.
     :param mime_type: MIME type of the document. Defaults to "text/plain".
@@ -67,7 +63,6 @@ class Document:
 
     id: str = field(default="")
     text: Optional[str] = field(default=None)
-    array: Optional[numpy.ndarray] = field(default=None)
     dataframe: Optional[pandas.DataFrame] = field(default=None)
     blob: Optional[bytes] = field(default=None)
     mime_type: str = field(default="text/plain")
@@ -79,8 +74,6 @@ def __str__(self):
         fields = [f"mimetype: '{self.mime_type}'"]
         if self.text is not None:
             fields.append(f"text: '{self.text}'" if len(self.text) < 100 else f"text: '{self.text[:100]}...'")
-        if self.array is not None:
-            fields.append(f"array: {self.array.shape}")
         if self.dataframe is not None:
             fields.append(f"dataframe: {self.dataframe.shape}")
         if self.blob is not None:
@@ -113,13 +106,12 @@ def _create_id(self):
         Creates a hash of the given content that acts as the document's ID.
         """
         text = self.text or None
-        array = self.array.tolist() if self.array is not None else None
         dataframe = self.dataframe.to_json() if self.dataframe is not None else None
         blob = self.blob or None
         mime_type = self.mime_type or None
         metadata = self.metadata or {}
         embedding = self.embedding if self.embedding is not None else None
-        data = f"{text}{array}{dataframe}{blob}{mime_type}{metadata}{embedding}"
+        data = f"{text}{dataframe}{blob}{mime_type}{metadata}{embedding}"
         return hashlib.sha256(data.encode("utf-8")).hexdigest()
 
     def to_dict(self):

diff --git a/haystack/preview/testing/document_store.py b/haystack/preview/testing/document_store.py
@@ -56,7 +56,6 @@ def filterable_docs(self) -> List[Document]:
                 )
             )
             documents.append(Document(dataframe=pd.DataFrame([i]), metadata={"name": f"table_doc_{i}"}))
-            documents.append(Document(array=np.array([i, i, i]), metadata={"name": f"array_doc_{i}"}))
             documents.append(
                 Document(text=f"Doc {i} with zeros emb", metadata={"name": "zeros_doc"}, embedding=embedding_zero)
             )
@@ -116,23 +115,6 @@ def test_filter_document_text(self, docstore: DocumentStore, filterable_docs: Li
         result = docstore.filter_documents(filters={"text": "A Foo Document 1"})
         assert self.contains_same_docs(result, [doc for doc in filterable_docs if doc.text == "A Foo Document 1"])
 
-    @pytest.mark.unit
-    def test_filter_document_array(self, docstore: DocumentStore, filterable_docs: List[Document]):
-        docstore.write_documents(filterable_docs)
-        result = docstore.filter_documents(filters={"array": np.array([1, 1, 1])})
-        assert self.contains_same_docs(
-            result,
-            [
-                doc
-                for doc in filterable_docs
-                if (
-                    doc.array is not None
-                    and doc.array.shape == np.array([1, 1, 1]).shape
-                    and (doc.array == np.array([1, 1, 1])).all()
-                )
-            ],
-        )
-
     @pytest.mark.unit
     def test_filter_document_dataframe(self, docstore: DocumentStore, filterable_docs: List[Document]):
         docstore.write_documents(filterable_docs)

diff --git a/releasenotes/notes/remove-document-array-fe70fd2cbb269add.yaml b/releasenotes/notes/remove-document-array-fe70fd2cbb269add.yaml
@@ -0,0 +1,4 @@
+---
+preview:
+  - |
+    Remove `array` field from `Document` dataclass.
diff --git a/test/preview/dataclasses/test_document.py b/test/preview/dataclasses/test_document.py
@@ -14,7 +14,6 @@
     "doc,doc_str",
     [
         (Document(text="test text"), "text: 'test text'"),
-        (Document(array=np.zeros((3, 7))), "array: (3, 7)"),
         (
             Document(dataframe=pd.DataFrame([["John", 25], ["Martha", 34]], columns=["name", "age"])),
             "dataframe: (2, 2)",
@@ -23,11 +22,10 @@
         (
             Document(
                 text="test text",
-                array=np.zeros((3, 7)),
                 dataframe=pd.DataFrame([["John", 25], ["Martha", 34]], columns=["name", "age"]),
                 blob=bytes("hello, test string".encode("utf-8")),
             ),
-            "text: 'test text', array: (3, 7), dataframe: (2, 2), blob: 18 bytes",
+            "text: 'test text', dataframe: (2, 2), blob: 18 bytes",
         ),
     ],
 )
@@ -82,7 +80,6 @@ def test_empty_document_to_dict():
     assert doc.to_dict() == {
         "id": doc._create_id(),
         "text": None,
-        "array": None,
         "dataframe": None,
         "blob": None,
         "mime_type": "text/plain",
@@ -101,7 +98,6 @@ def test_empty_document_from_dict():
 def test_full_document_to_dict():
     doc = Document(
         text="test text",
-        array=np.array([1, 2, 3]),
         dataframe=pd.DataFrame([10, 20, 30]),
         blob=b"some bytes",
         mime_type="application/pdf",
@@ -111,9 +107,6 @@ def test_full_document_to_dict():
     )
     dictionary = doc.to_dict()
 
-    array = dictionary.pop("array")
-    assert array.shape == doc.array.shape and (array == doc.array).all()
-
     dataframe = dictionary.pop("dataframe")
     assert dataframe.equals(doc.dataframe)
 
@@ -138,7 +131,6 @@ def test_document_with_most_attributes_from_dict():
     assert Document.from_dict(
         {
             "text": "test text",
-            "array": np.array([1, 2, 3]),
             "dataframe": pd.DataFrame([10, 20, 30]),
             "blob": b"some bytes",
             "mime_type": "application/pdf",
@@ -148,7 +140,6 @@ def test_document_with_most_attributes_from_dict():
         }
     ) == Document(
         text="test text",
-        array=np.array([1, 2, 3]),
         dataframe=pd.DataFrame([10, 20, 30]),
         blob=b"some bytes",
         mime_type="application/pdf",
@@ -165,7 +156,6 @@ def test_empty_document_to_json():
         {
             "id": doc.id,
             "text": None,
-            "array": None,
             "dataframe": None,
             "mime_type": "text/plain",
             "metadata": {},
@@ -188,7 +178,6 @@ def __repr__(self):
 
     doc_1 = Document(
         text="test text",
-        array=np.array([1, 2, 3]),
         dataframe=pd.DataFrame([10, 20, 30]),
         blob=b"some bytes",
         mime_type="application/pdf",
@@ -200,7 +189,6 @@ def __repr__(self):
         {
             "id": doc_1.id,
             "text": "test text",
-            "array": [1, 2, 3],
             "dataframe": '{"0":{"0":10,"1":20,"2":30}}',
             "mime_type": "application/pdf",
             "metadata": {"some object": "<the object>", "a path": str((tmp_path / "test.txt").absolute())},
@@ -223,7 +211,6 @@ def __eq__(self, other):
         json.dumps(
             {
                 "text": "test text",
-                "array": [1, 2, 3],
                 "dataframe": '{"0":{"0":10,"1":20,"2":30}}',
                 "mime_type": "application/pdf",
                 "metadata": {"some object": "<the object>", "a path": str((tmp_path / "test.txt").absolute())},
@@ -234,7 +221,6 @@ def __eq__(self, other):
     )
     assert doc == Document(
         text="test text",
-        array=np.array([1, 2, 3]),
         dataframe=pd.DataFrame([10, 20, 30]),
         blob=None,
         mime_type="application/pdf",
@@ -263,7 +249,6 @@ def default(self, obj):
         {
             "id": doc.id,
             "text": "test text",
-            "array": None,
             "dataframe": None,
             "mime_type": "text/plain",
             "metadata": {"some object": "<<CUSTOM ENCODING>>"},
@@ -298,7 +283,6 @@ def object_hook(self, dictionary):
             {
                 "id": doc.id,
                 "text": "test text",
-                "array": None,
                 "dataframe": None,
                 "mime_type": "text/plain",
                 "metadata": {"some object": "<<CUSTOM ENCODING>>"},
@@ -316,7 +300,6 @@ def test_flatten_document_no_meta():
     assert doc.flatten() == {
         "id": doc.id,
         "text": "test text",
-        "array": None,
         "dataframe": None,
         "blob": None,
         "mime_type": "text/plain",
@@ -331,7 +314,6 @@ def test_flatten_document_with_flat_meta():
     assert doc.flatten() == {
         "id": doc.id,
         "text": "test text",
-        "array": None,
         "dataframe": None,
         "blob": None,
         "mime_type": "text/plain",
@@ -348,7 +330,6 @@ def test_flatten_document_with_nested_meta():
     assert doc.flatten() == {
         "id": doc.id,
         "text": "test text",
-        "array": None,
         "dataframe": None,
         "blob": None,
         "mime_type": "text/plain",

diff --git a/test/preview/document_stores/test_in_memory.py b/test/preview/document_stores/test_in_memory.py
@@ -206,53 +206,40 @@ def test_bm25_retrieval_with_text_and_table_content(self, docstore: DocumentStor
 
     @pytest.mark.unit
     def test_bm25_retrieval_default_filter_for_text_and_dataframes(self, docstore: DocumentStore):
-        docs = [
-            Document(array=np.array([1, 2, 3])),
-            Document(text="Gardening", array=np.array([1, 2, 3])),
-            Document(text="Bird watching"),
-        ]
+        docs = [Document(), Document(text="Gardening"), Document(text="Bird watching")]
         docstore.write_documents(docs)
         results = docstore.bm25_retrieval(query="doesn't matter, top_k is 10", top_k=10)
         assert len(results) == 2
 
     @pytest.mark.unit
     def test_bm25_retrieval_with_filters(self, docstore: DocumentStore):
-        selected_document = Document(text="Gardening", array=np.array([1, 2, 3]), metadata={"selected": True})
-        docs = [Document(array=np.array([1, 2, 3])), selected_document, Document(text="Bird watching")]
+        selected_document = Document(text="Gardening", metadata={"selected": True})
+        docs = [Document(), selected_document, Document(text="Bird watching")]
         docstore.write_documents(docs)
         results = docstore.bm25_retrieval(query="Java", top_k=10, filters={"selected": True})
         assert results == [selected_document]
 
     @pytest.mark.unit
     def test_bm25_retrieval_with_filters_keeps_default_filters(self, docstore: DocumentStore):
-        docs = [
-            Document(array=np.array([1, 2, 3]), metadata={"selected": True}),
-            Document(text="Gardening", array=np.array([1, 2, 3])),
-            Document(text="Bird watching"),
-        ]
+        docs = [Document(metadata={"selected": True}), Document(text="Gardening"), Document(text="Bird watching")]
         docstore.write_documents(docs)
         results = docstore.bm25_retrieval(query="Java", top_k=10, filters={"selected": True})
         assert not len(results)
 
     @pytest.mark.unit
     def test_bm25_retrieval_with_filters_on_text_or_dataframe(self, docstore: DocumentStore):
         document = Document(dataframe=pd.DataFrame({"language": ["Python", "Java"], "use": ["Data Science", "Web"]}))
-        docs = [
-            Document(array=np.array([1, 2, 3])),
-            Document(text="Gardening"),
-            Document(text="Bird watching"),
-            document,
-        ]
+        docs = [Document(), Document(text="Gardening"), Document(text="Bird watching"), document]
         docstore.write_documents(docs)
         results = docstore.bm25_retrieval(query="Java", top_k=10, filters={"text": None})
         assert results == [document]
 
     @pytest.mark.unit
     def test_bm25_retrieval_with_documents_with_mixed_content(self, docstore: DocumentStore):
-        double_document = Document(text="Gardening", array=np.array([1, 2, 3]))
-        docs = [Document(array=np.array([1, 2, 3])), double_document, Document(text="Bird watching")]
+        double_document = Document(text="Gardening", embedding=[1, 2, 3])
+        docs = [Document(embedding=[1, 2, 3]), double_document, Document(text="Bird watching")]
         docstore.write_documents(docs)
-        results = docstore.bm25_retrieval(query="Java", top_k=10, filters={"array": {"$not": None}})
+        results = docstore.bm25_retrieval(query="Java", top_k=10, filters={"embedding": {"$not": None}})
         assert results == [double_document]
 
     @pytest.mark.unit