From 75e015605c95988f904c1691bffa01b10f728f45 Mon Sep 17 00:00:00 2001 From: Silvano Cerza Date: Fri, 20 Oct 2023 17:31:35 +0200 Subject: [PATCH] Remove Document.array field --- haystack/preview/dataclasses/document.py | 10 +------ haystack/preview/testing/document_store.py | 18 ------------ ...emove-document-array-fe70fd2cbb269add.yaml | 4 +++ test/preview/dataclasses/test_document.py | 21 +------------- .../preview/document_stores/test_in_memory.py | 29 +++++-------------- 5 files changed, 14 insertions(+), 68 deletions(-) create mode 100644 releasenotes/notes/remove-document-array-fe70fd2cbb269add.yaml diff --git a/haystack/preview/dataclasses/document.py b/haystack/preview/dataclasses/document.py index 9c0edb9b52..3ee8150d5f 100644 --- a/haystack/preview/dataclasses/document.py +++ b/haystack/preview/dataclasses/document.py @@ -38,8 +38,6 @@ def __init__(self, *_, object_hook=None, **__): super().__init__(object_hook=object_hook or self.document_decoder) def document_decoder(self, dictionary): - if "array" in dictionary and dictionary.get("array"): - dictionary["array"] = numpy.array(dictionary.get("array")) if "dataframe" in dictionary and dictionary.get("dataframe"): dictionary["dataframe"] = pandas.read_json(dictionary.get("dataframe", None)) @@ -55,8 +53,6 @@ class Document: :param id: Unique identifier for the document. When not set, it's generated based on the Document fields' values. :param text: Text of the document, if the document contains text. - :param array: Array of numbers associated with the document, if the document contains matrix data like image, - audio, video, and such. :param dataframe: Pandas dataframe with the document's content, if the document contains tabular data. :param blob: Binary data associated with the document, if the document has any binary data associated with it. :param mime_type: MIME type of the document. Defaults to "text/plain". @@ -67,7 +63,6 @@ class Document: id: str = field(default="") text: Optional[str] = field(default=None) - array: Optional[numpy.ndarray] = field(default=None) dataframe: Optional[pandas.DataFrame] = field(default=None) blob: Optional[bytes] = field(default=None) mime_type: str = field(default="text/plain") @@ -79,8 +74,6 @@ def __str__(self): fields = [f"mimetype: '{self.mime_type}'"] if self.text is not None: fields.append(f"text: '{self.text}'" if len(self.text) < 100 else f"text: '{self.text[:100]}...'") - if self.array is not None: - fields.append(f"array: {self.array.shape}") if self.dataframe is not None: fields.append(f"dataframe: {self.dataframe.shape}") if self.blob is not None: @@ -113,13 +106,12 @@ def _create_id(self): Creates a hash of the given content that acts as the document's ID. """ text = self.text or None - array = self.array.tolist() if self.array is not None else None dataframe = self.dataframe.to_json() if self.dataframe is not None else None blob = self.blob or None mime_type = self.mime_type or None metadata = self.metadata or {} embedding = self.embedding if self.embedding is not None else None - data = f"{text}{array}{dataframe}{blob}{mime_type}{metadata}{embedding}" + data = f"{text}{dataframe}{blob}{mime_type}{metadata}{embedding}" return hashlib.sha256(data.encode("utf-8")).hexdigest() def to_dict(self): diff --git a/haystack/preview/testing/document_store.py b/haystack/preview/testing/document_store.py index 68a5b2e528..14bfeac89f 100644 --- a/haystack/preview/testing/document_store.py +++ b/haystack/preview/testing/document_store.py @@ -56,7 +56,6 @@ def filterable_docs(self) -> List[Document]: ) ) documents.append(Document(dataframe=pd.DataFrame([i]), metadata={"name": f"table_doc_{i}"})) - documents.append(Document(array=np.array([i, i, i]), metadata={"name": f"array_doc_{i}"})) documents.append( Document(text=f"Doc {i} with zeros emb", metadata={"name": "zeros_doc"}, embedding=embedding_zero) ) @@ -116,23 +115,6 @@ def test_filter_document_text(self, docstore: DocumentStore, filterable_docs: Li result = docstore.filter_documents(filters={"text": "A Foo Document 1"}) assert self.contains_same_docs(result, [doc for doc in filterable_docs if doc.text == "A Foo Document 1"]) - @pytest.mark.unit - def test_filter_document_array(self, docstore: DocumentStore, filterable_docs: List[Document]): - docstore.write_documents(filterable_docs) - result = docstore.filter_documents(filters={"array": np.array([1, 1, 1])}) - assert self.contains_same_docs( - result, - [ - doc - for doc in filterable_docs - if ( - doc.array is not None - and doc.array.shape == np.array([1, 1, 1]).shape - and (doc.array == np.array([1, 1, 1])).all() - ) - ], - ) - @pytest.mark.unit def test_filter_document_dataframe(self, docstore: DocumentStore, filterable_docs: List[Document]): docstore.write_documents(filterable_docs) diff --git a/releasenotes/notes/remove-document-array-fe70fd2cbb269add.yaml b/releasenotes/notes/remove-document-array-fe70fd2cbb269add.yaml new file mode 100644 index 0000000000..64a326e21d --- /dev/null +++ b/releasenotes/notes/remove-document-array-fe70fd2cbb269add.yaml @@ -0,0 +1,4 @@ +--- +preview: + - | + Remove `array` field from `Document` dataclass. diff --git a/test/preview/dataclasses/test_document.py b/test/preview/dataclasses/test_document.py index ef426f5817..e43b77fbc8 100644 --- a/test/preview/dataclasses/test_document.py +++ b/test/preview/dataclasses/test_document.py @@ -14,7 +14,6 @@ "doc,doc_str", [ (Document(text="test text"), "text: 'test text'"), - (Document(array=np.zeros((3, 7))), "array: (3, 7)"), ( Document(dataframe=pd.DataFrame([["John", 25], ["Martha", 34]], columns=["name", "age"])), "dataframe: (2, 2)", @@ -23,11 +22,10 @@ ( Document( text="test text", - array=np.zeros((3, 7)), dataframe=pd.DataFrame([["John", 25], ["Martha", 34]], columns=["name", "age"]), blob=bytes("hello, test string".encode("utf-8")), ), - "text: 'test text', array: (3, 7), dataframe: (2, 2), blob: 18 bytes", + "text: 'test text', dataframe: (2, 2), blob: 18 bytes", ), ], ) @@ -82,7 +80,6 @@ def test_empty_document_to_dict(): assert doc.to_dict() == { "id": doc._create_id(), "text": None, - "array": None, "dataframe": None, "blob": None, "mime_type": "text/plain", @@ -101,7 +98,6 @@ def test_empty_document_from_dict(): def test_full_document_to_dict(): doc = Document( text="test text", - array=np.array([1, 2, 3]), dataframe=pd.DataFrame([10, 20, 30]), blob=b"some bytes", mime_type="application/pdf", @@ -111,9 +107,6 @@ def test_full_document_to_dict(): ) dictionary = doc.to_dict() - array = dictionary.pop("array") - assert array.shape == doc.array.shape and (array == doc.array).all() - dataframe = dictionary.pop("dataframe") assert dataframe.equals(doc.dataframe) @@ -138,7 +131,6 @@ def test_document_with_most_attributes_from_dict(): assert Document.from_dict( { "text": "test text", - "array": np.array([1, 2, 3]), "dataframe": pd.DataFrame([10, 20, 30]), "blob": b"some bytes", "mime_type": "application/pdf", @@ -148,7 +140,6 @@ def test_document_with_most_attributes_from_dict(): } ) == Document( text="test text", - array=np.array([1, 2, 3]), dataframe=pd.DataFrame([10, 20, 30]), blob=b"some bytes", mime_type="application/pdf", @@ -165,7 +156,6 @@ def test_empty_document_to_json(): { "id": doc.id, "text": None, - "array": None, "dataframe": None, "mime_type": "text/plain", "metadata": {}, @@ -188,7 +178,6 @@ def __repr__(self): doc_1 = Document( text="test text", - array=np.array([1, 2, 3]), dataframe=pd.DataFrame([10, 20, 30]), blob=b"some bytes", mime_type="application/pdf", @@ -200,7 +189,6 @@ def __repr__(self): { "id": doc_1.id, "text": "test text", - "array": [1, 2, 3], "dataframe": '{"0":{"0":10,"1":20,"2":30}}', "mime_type": "application/pdf", "metadata": {"some object": "", "a path": str((tmp_path / "test.txt").absolute())}, @@ -223,7 +211,6 @@ def __eq__(self, other): json.dumps( { "text": "test text", - "array": [1, 2, 3], "dataframe": '{"0":{"0":10,"1":20,"2":30}}', "mime_type": "application/pdf", "metadata": {"some object": "", "a path": str((tmp_path / "test.txt").absolute())}, @@ -234,7 +221,6 @@ def __eq__(self, other): ) assert doc == Document( text="test text", - array=np.array([1, 2, 3]), dataframe=pd.DataFrame([10, 20, 30]), blob=None, mime_type="application/pdf", @@ -263,7 +249,6 @@ def default(self, obj): { "id": doc.id, "text": "test text", - "array": None, "dataframe": None, "mime_type": "text/plain", "metadata": {"some object": "<>"}, @@ -298,7 +283,6 @@ def object_hook(self, dictionary): { "id": doc.id, "text": "test text", - "array": None, "dataframe": None, "mime_type": "text/plain", "metadata": {"some object": "<>"}, @@ -316,7 +300,6 @@ def test_flatten_document_no_meta(): assert doc.flatten() == { "id": doc.id, "text": "test text", - "array": None, "dataframe": None, "blob": None, "mime_type": "text/plain", @@ -331,7 +314,6 @@ def test_flatten_document_with_flat_meta(): assert doc.flatten() == { "id": doc.id, "text": "test text", - "array": None, "dataframe": None, "blob": None, "mime_type": "text/plain", @@ -348,7 +330,6 @@ def test_flatten_document_with_nested_meta(): assert doc.flatten() == { "id": doc.id, "text": "test text", - "array": None, "dataframe": None, "blob": None, "mime_type": "text/plain", diff --git a/test/preview/document_stores/test_in_memory.py b/test/preview/document_stores/test_in_memory.py index 465b91a125..473600c88a 100644 --- a/test/preview/document_stores/test_in_memory.py +++ b/test/preview/document_stores/test_in_memory.py @@ -206,30 +206,22 @@ def test_bm25_retrieval_with_text_and_table_content(self, docstore: DocumentStor @pytest.mark.unit def test_bm25_retrieval_default_filter_for_text_and_dataframes(self, docstore: DocumentStore): - docs = [ - Document(array=np.array([1, 2, 3])), - Document(text="Gardening", array=np.array([1, 2, 3])), - Document(text="Bird watching"), - ] + docs = [Document(), Document(text="Gardening"), Document(text="Bird watching")] docstore.write_documents(docs) results = docstore.bm25_retrieval(query="doesn't matter, top_k is 10", top_k=10) assert len(results) == 2 @pytest.mark.unit def test_bm25_retrieval_with_filters(self, docstore: DocumentStore): - selected_document = Document(text="Gardening", array=np.array([1, 2, 3]), metadata={"selected": True}) - docs = [Document(array=np.array([1, 2, 3])), selected_document, Document(text="Bird watching")] + selected_document = Document(text="Gardening", metadata={"selected": True}) + docs = [Document(), selected_document, Document(text="Bird watching")] docstore.write_documents(docs) results = docstore.bm25_retrieval(query="Java", top_k=10, filters={"selected": True}) assert results == [selected_document] @pytest.mark.unit def test_bm25_retrieval_with_filters_keeps_default_filters(self, docstore: DocumentStore): - docs = [ - Document(array=np.array([1, 2, 3]), metadata={"selected": True}), - Document(text="Gardening", array=np.array([1, 2, 3])), - Document(text="Bird watching"), - ] + docs = [Document(metadata={"selected": True}), Document(text="Gardening"), Document(text="Bird watching")] docstore.write_documents(docs) results = docstore.bm25_retrieval(query="Java", top_k=10, filters={"selected": True}) assert not len(results) @@ -237,22 +229,17 @@ def test_bm25_retrieval_with_filters_keeps_default_filters(self, docstore: Docum @pytest.mark.unit def test_bm25_retrieval_with_filters_on_text_or_dataframe(self, docstore: DocumentStore): document = Document(dataframe=pd.DataFrame({"language": ["Python", "Java"], "use": ["Data Science", "Web"]})) - docs = [ - Document(array=np.array([1, 2, 3])), - Document(text="Gardening"), - Document(text="Bird watching"), - document, - ] + docs = [Document(), Document(text="Gardening"), Document(text="Bird watching"), document] docstore.write_documents(docs) results = docstore.bm25_retrieval(query="Java", top_k=10, filters={"text": None}) assert results == [document] @pytest.mark.unit def test_bm25_retrieval_with_documents_with_mixed_content(self, docstore: DocumentStore): - double_document = Document(text="Gardening", array=np.array([1, 2, 3])) - docs = [Document(array=np.array([1, 2, 3])), double_document, Document(text="Bird watching")] + double_document = Document(text="Gardening", embedding=[1, 2, 3]) + docs = [Document(embedding=[1, 2, 3]), double_document, Document(text="Bird watching")] docstore.write_documents(docs) - results = docstore.bm25_retrieval(query="Java", top_k=10, filters={"array": {"$not": None}}) + results = docstore.bm25_retrieval(query="Java", top_k=10, filters={"embedding": {"$not": None}}) assert results == [double_document] @pytest.mark.unit