Skip to content

Commit

Permalink
Remove Document.array field
Browse files (browse the repository at this point in the history)
  • Loading branch information
silvanocerza committed Oct 23, 2023
1 parent 101bd81 commit 75e0156
Show file tree
Hide file tree
Showing 5 changed files with 14 additions and 68 deletions.
10 changes: 1 addition & 9 deletions haystack/preview/dataclasses/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,6 @@ def __init__(self, *_, object_hook=None, **__):
super().__init__(object_hook=object_hook or self.document_decoder)

def document_decoder(self, dictionary):
if "array" in dictionary and dictionary.get("array"):
dictionary["array"] = numpy.array(dictionary.get("array"))
if "dataframe" in dictionary and dictionary.get("dataframe"):
dictionary["dataframe"] = pandas.read_json(dictionary.get("dataframe", None))

Expand All @@ -55,8 +53,6 @@ class Document:
:param id: Unique identifier for the document. When not set, it's generated based on the Document fields' values.
:param text: Text of the document, if the document contains text.
:param array: Array of numbers associated with the document, if the document contains matrix data like image,
audio, video, and such.
:param dataframe: Pandas dataframe with the document's content, if the document contains tabular data.
:param blob: Binary data associated with the document, if the document has any binary data associated with it.
:param mime_type: MIME type of the document. Defaults to "text/plain".
Expand All @@ -67,7 +63,6 @@ class Document:

id: str = field(default="")
text: Optional[str] = field(default=None)
array: Optional[numpy.ndarray] = field(default=None)
dataframe: Optional[pandas.DataFrame] = field(default=None)
blob: Optional[bytes] = field(default=None)
mime_type: str = field(default="text/plain")
Expand All @@ -79,8 +74,6 @@ def __str__(self):
fields = [f"mimetype: '{self.mime_type}'"]
if self.text is not None:
fields.append(f"text: '{self.text}'" if len(self.text) < 100 else f"text: '{self.text[:100]}...'")
if self.array is not None:
fields.append(f"array: {self.array.shape}")
if self.dataframe is not None:
fields.append(f"dataframe: {self.dataframe.shape}")
if self.blob is not None:
Expand Down Expand Up @@ -113,13 +106,12 @@ def _create_id(self):
Creates a hash of the given content that acts as the document's ID.
"""
text = self.text or None
array = self.array.tolist() if self.array is not None else None
dataframe = self.dataframe.to_json() if self.dataframe is not None else None
blob = self.blob or None
mime_type = self.mime_type or None
metadata = self.metadata or {}
embedding = self.embedding if self.embedding is not None else None
data = f"{text}{array}{dataframe}{blob}{mime_type}{metadata}{embedding}"
data = f"{text}{dataframe}{blob}{mime_type}{metadata}{embedding}"
return hashlib.sha256(data.encode("utf-8")).hexdigest()

def to_dict(self):
Expand Down
18 changes: 0 additions & 18 deletions haystack/preview/testing/document_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ def filterable_docs(self) -> List[Document]:
)
)
documents.append(Document(dataframe=pd.DataFrame([i]), metadata={"name": f"table_doc_{i}"}))
documents.append(Document(array=np.array([i, i, i]), metadata={"name": f"array_doc_{i}"}))
documents.append(
Document(text=f"Doc {i} with zeros emb", metadata={"name": "zeros_doc"}, embedding=embedding_zero)
)
Expand Down Expand Up @@ -116,23 +115,6 @@ def test_filter_document_text(self, docstore: DocumentStore, filterable_docs: Li
result = docstore.filter_documents(filters={"text": "A Foo Document 1"})
assert self.contains_same_docs(result, [doc for doc in filterable_docs if doc.text == "A Foo Document 1"])

@pytest.mark.unit
def test_filter_document_array(self, docstore: DocumentStore, filterable_docs: List[Document]):
docstore.write_documents(filterable_docs)
result = docstore.filter_documents(filters={"array": np.array([1, 1, 1])})
assert self.contains_same_docs(
result,
[
doc
for doc in filterable_docs
if (
doc.array is not None
and doc.array.shape == np.array([1, 1, 1]).shape
and (doc.array == np.array([1, 1, 1])).all()
)
],
)

@pytest.mark.unit
def test_filter_document_dataframe(self, docstore: DocumentStore, filterable_docs: List[Document]):
docstore.write_documents(filterable_docs)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
preview:
- |
Remove the `array` field from the `Document` dataclass.
21 changes: 1 addition & 20 deletions test/preview/dataclasses/test_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
"doc,doc_str",
[
(Document(text="test text"), "text: 'test text'"),
(Document(array=np.zeros((3, 7))), "array: (3, 7)"),
(
Document(dataframe=pd.DataFrame([["John", 25], ["Martha", 34]], columns=["name", "age"])),
"dataframe: (2, 2)",
Expand All @@ -23,11 +22,10 @@
(
Document(
text="test text",
array=np.zeros((3, 7)),
dataframe=pd.DataFrame([["John", 25], ["Martha", 34]], columns=["name", "age"]),
blob=bytes("hello, test string".encode("utf-8")),
),
"text: 'test text', array: (3, 7), dataframe: (2, 2), blob: 18 bytes",
"text: 'test text', dataframe: (2, 2), blob: 18 bytes",
),
],
)
Expand Down Expand Up @@ -82,7 +80,6 @@ def test_empty_document_to_dict():
assert doc.to_dict() == {
"id": doc._create_id(),
"text": None,
"array": None,
"dataframe": None,
"blob": None,
"mime_type": "text/plain",
Expand All @@ -101,7 +98,6 @@ def test_empty_document_from_dict():
def test_full_document_to_dict():
doc = Document(
text="test text",
array=np.array([1, 2, 3]),
dataframe=pd.DataFrame([10, 20, 30]),
blob=b"some bytes",
mime_type="application/pdf",
Expand All @@ -111,9 +107,6 @@ def test_full_document_to_dict():
)
dictionary = doc.to_dict()

array = dictionary.pop("array")
assert array.shape == doc.array.shape and (array == doc.array).all()

dataframe = dictionary.pop("dataframe")
assert dataframe.equals(doc.dataframe)

Expand All @@ -138,7 +131,6 @@ def test_document_with_most_attributes_from_dict():
assert Document.from_dict(
{
"text": "test text",
"array": np.array([1, 2, 3]),
"dataframe": pd.DataFrame([10, 20, 30]),
"blob": b"some bytes",
"mime_type": "application/pdf",
Expand All @@ -148,7 +140,6 @@ def test_document_with_most_attributes_from_dict():
}
) == Document(
text="test text",
array=np.array([1, 2, 3]),
dataframe=pd.DataFrame([10, 20, 30]),
blob=b"some bytes",
mime_type="application/pdf",
Expand All @@ -165,7 +156,6 @@ def test_empty_document_to_json():
{
"id": doc.id,
"text": None,
"array": None,
"dataframe": None,
"mime_type": "text/plain",
"metadata": {},
Expand All @@ -188,7 +178,6 @@ def __repr__(self):

doc_1 = Document(
text="test text",
array=np.array([1, 2, 3]),
dataframe=pd.DataFrame([10, 20, 30]),
blob=b"some bytes",
mime_type="application/pdf",
Expand All @@ -200,7 +189,6 @@ def __repr__(self):
{
"id": doc_1.id,
"text": "test text",
"array": [1, 2, 3],
"dataframe": '{"0":{"0":10,"1":20,"2":30}}',
"mime_type": "application/pdf",
"metadata": {"some object": "<the object>", "a path": str((tmp_path / "test.txt").absolute())},
Expand All @@ -223,7 +211,6 @@ def __eq__(self, other):
json.dumps(
{
"text": "test text",
"array": [1, 2, 3],
"dataframe": '{"0":{"0":10,"1":20,"2":30}}',
"mime_type": "application/pdf",
"metadata": {"some object": "<the object>", "a path": str((tmp_path / "test.txt").absolute())},
Expand All @@ -234,7 +221,6 @@ def __eq__(self, other):
)
assert doc == Document(
text="test text",
array=np.array([1, 2, 3]),
dataframe=pd.DataFrame([10, 20, 30]),
blob=None,
mime_type="application/pdf",
Expand Down Expand Up @@ -263,7 +249,6 @@ def default(self, obj):
{
"id": doc.id,
"text": "test text",
"array": None,
"dataframe": None,
"mime_type": "text/plain",
"metadata": {"some object": "<<CUSTOM ENCODING>>"},
Expand Down Expand Up @@ -298,7 +283,6 @@ def object_hook(self, dictionary):
{
"id": doc.id,
"text": "test text",
"array": None,
"dataframe": None,
"mime_type": "text/plain",
"metadata": {"some object": "<<CUSTOM ENCODING>>"},
Expand All @@ -316,7 +300,6 @@ def test_flatten_document_no_meta():
assert doc.flatten() == {
"id": doc.id,
"text": "test text",
"array": None,
"dataframe": None,
"blob": None,
"mime_type": "text/plain",
Expand All @@ -331,7 +314,6 @@ def test_flatten_document_with_flat_meta():
assert doc.flatten() == {
"id": doc.id,
"text": "test text",
"array": None,
"dataframe": None,
"blob": None,
"mime_type": "text/plain",
Expand All @@ -348,7 +330,6 @@ def test_flatten_document_with_nested_meta():
assert doc.flatten() == {
"id": doc.id,
"text": "test text",
"array": None,
"dataframe": None,
"blob": None,
"mime_type": "text/plain",
Expand Down
29 changes: 8 additions & 21 deletions test/preview/document_stores/test_in_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,53 +206,40 @@ def test_bm25_retrieval_with_text_and_table_content(self, docstore: DocumentStor

@pytest.mark.unit
def test_bm25_retrieval_default_filter_for_text_and_dataframes(self, docstore: DocumentStore):
docs = [
Document(array=np.array([1, 2, 3])),
Document(text="Gardening", array=np.array([1, 2, 3])),
Document(text="Bird watching"),
]
docs = [Document(), Document(text="Gardening"), Document(text="Bird watching")]
docstore.write_documents(docs)
results = docstore.bm25_retrieval(query="doesn't matter, top_k is 10", top_k=10)
assert len(results) == 2

@pytest.mark.unit
def test_bm25_retrieval_with_filters(self, docstore: DocumentStore):
selected_document = Document(text="Gardening", array=np.array([1, 2, 3]), metadata={"selected": True})
docs = [Document(array=np.array([1, 2, 3])), selected_document, Document(text="Bird watching")]
selected_document = Document(text="Gardening", metadata={"selected": True})
docs = [Document(), selected_document, Document(text="Bird watching")]
docstore.write_documents(docs)
results = docstore.bm25_retrieval(query="Java", top_k=10, filters={"selected": True})
assert results == [selected_document]

@pytest.mark.unit
def test_bm25_retrieval_with_filters_keeps_default_filters(self, docstore: DocumentStore):
docs = [
Document(array=np.array([1, 2, 3]), metadata={"selected": True}),
Document(text="Gardening", array=np.array([1, 2, 3])),
Document(text="Bird watching"),
]
docs = [Document(metadata={"selected": True}), Document(text="Gardening"), Document(text="Bird watching")]
docstore.write_documents(docs)
results = docstore.bm25_retrieval(query="Java", top_k=10, filters={"selected": True})
assert not len(results)

@pytest.mark.unit
def test_bm25_retrieval_with_filters_on_text_or_dataframe(self, docstore: DocumentStore):
document = Document(dataframe=pd.DataFrame({"language": ["Python", "Java"], "use": ["Data Science", "Web"]}))
docs = [
Document(array=np.array([1, 2, 3])),
Document(text="Gardening"),
Document(text="Bird watching"),
document,
]
docs = [Document(), Document(text="Gardening"), Document(text="Bird watching"), document]
docstore.write_documents(docs)
results = docstore.bm25_retrieval(query="Java", top_k=10, filters={"text": None})
assert results == [document]

@pytest.mark.unit
def test_bm25_retrieval_with_documents_with_mixed_content(self, docstore: DocumentStore):
double_document = Document(text="Gardening", array=np.array([1, 2, 3]))
docs = [Document(array=np.array([1, 2, 3])), double_document, Document(text="Bird watching")]
double_document = Document(text="Gardening", embedding=[1, 2, 3])
docs = [Document(embedding=[1, 2, 3]), double_document, Document(text="Bird watching")]
docstore.write_documents(docs)
results = docstore.bm25_retrieval(query="Java", top_k=10, filters={"array": {"$not": None}})
results = docstore.bm25_retrieval(query="Java", top_k=10, filters={"embedding": {"$not": None}})
assert results == [double_document]

@pytest.mark.unit
Expand Down

0 comments on commit 75e0156

Please sign in to comment.