
Commit

Merge branch 'deepset-ai:main' into docemb
shag1802 authored Oct 23, 2023
2 parents 95af3c5 + 8f28928 commit 11070ef
Showing 5 changed files with 17 additions and 28 deletions.
17 changes: 2 additions & 15 deletions haystack/preview/dataclasses/document.py
@@ -3,7 +3,7 @@
import logging
from dataclasses import asdict, dataclass, field, fields
from pathlib import Path
from typing import Any, Dict, List, Optional, Type
from typing import Any, Dict, Optional, Type

import numpy
import pandas
@@ -48,33 +48,21 @@ def document_decoder(self, dictionary):
return dictionary


def id_hash_keys_default_factory():
"""
Default factory for the id_hash_keys field of the Document dataclass.
We need a callable instead of a default value, because mutable default values are not allowed.
"""
return ["text", "array", "dataframe", "blob"]


@dataclass
class Document:
"""
Base data class containing some data to be queried.
Can contain text snippets, tables, and file paths to images or audio files.
Documents can be sorted by score and serialized to and from dictionaries and JSON.
:param id: Unique identifier for the document. When not set, it's generated based on the document's attributes (see id_hash_keys).
:param id: Unique identifier for the document. When not set, it's generated based on the Document fields' values.
:param text: Text of the document, if the document contains text.
:param array: Array of numbers associated with the document, if the document contains matrix data such as
images, audio, or video.
:param dataframe: Pandas dataframe with the document's content, if the document contains tabular data.
:param blob: Binary data associated with the document, if any.
:param mime_type: MIME type of the document. Defaults to "text/plain".
:param metadata: Additional custom metadata for the document.
:param id_hash_keys: List of keys to use for the ID hash. Defaults to the four content fields of the document:
text, array, dataframe and blob. This field can include other document fields (like mime_type) and metadata's
top-level keys. Note that the order of the keys is important: the ID hash will be generated by concatenating
the values of the keys in the order they appear in this list. Changing the order impacts the ID hash.
:param score: Score of the document. Used for ranking, usually assigned by retrievers.
:param embedding: Vector representation of the document.
"""
@@ -86,7 +74,6 @@ class Document:
blob: Optional[bytes] = field(default=None)
mime_type: str = field(default="text/plain")
metadata: Dict[str, Any] = field(default_factory=dict)
id_hash_keys: List[str] = field(default_factory=id_hash_keys_default_factory, hash=False)
score: Optional[float] = field(default=None)
embedding: Optional[numpy.ndarray] = field(default=None, repr=False)

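After this change, a `Document` carries only its content fields plus `score` and `embedding`, and its `id` is derived from the document's own values when not provided. A minimal usage sketch (assuming `Document` is importable from `haystack.preview.dataclasses`, as the file path above suggests):

from haystack.preview.dataclasses import Document

# No id is passed, so it is generated automatically from the document's
# field values (text, dataframe, blob, mime_type, metadata, ...).
doc = Document(text="some text", metadata={"source": "example"})
print(doc.id)         # auto-generated identifier
print(doc.mime_type)  # "text/plain" by default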
2 changes: 1 addition & 1 deletion releasenotes/notes/doc-id-rework-85e82b5359282f7e.yaml
@@ -2,4 +2,4 @@
preview:
- |
Rework `Document.id` generation: if an `id` is not explicitly set, it's generated
using all `Document` fields' values; `score` and `id_hash_keys` are not used.
using all `Document` fields' values; `score` is not used.
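The exact hashing scheme is not shown in this diff; a rough, purely illustrative sketch of deriving a deterministic id from field values (the real implementation may hash a different set of fields) could look like:

import hashlib

def generate_id(text, dataframe, blob, mime_type, metadata) -> str:
    # Concatenate string representations of the content-bearing fields;
    # score deliberately does not participate in the hash, per the note above.
    data = f"{text}{dataframe}{blob}{mime_type}{metadata}"
    return hashlib.sha256(data.encode("utf-8")).hexdigest()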
2 changes: 1 addition & 1 deletion releasenotes/notes/refactor-openai-document-embedder.yaml
@@ -1,4 +1,4 @@
---
enhancements:
preview:
- |
Refactor OpenAIDocumentEmbedder to enrich documents with embeddings instead of recreating them.
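In practice, "enrich instead of recreate" means the embedder assigns embeddings onto the incoming `Document` objects rather than constructing new ones, so ids and metadata are preserved. A simplified sketch (function and variable names here are illustrative, not the component's actual API):

def attach_embeddings(documents, embeddings):
    # Mutate the existing Document objects in place instead of building
    # new ones, so ids, metadata, and scores are left untouched.
    for doc, emb in zip(documents, embeddings):
        doc.embedding = emb
    return documents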
12 changes: 12 additions & 0 deletions releasenotes/notes/remove-id-hash-document-93e4a589b3fd2aad.yaml
@@ -0,0 +1,12 @@
---
preview:
- |
Remove `id_hash_keys` field from `Document` dataclass.
`id_hash_keys` has also been removed from the Components that were using it:
* `DocumentCleaner`
* `TextDocumentSplitter`
* `PyPDFToDocument`
* `AzureOCRDocumentConverter`
* `HTMLToDocument`
* `TextFileToDocument`
* `TikaDocumentConverter`
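For code that previously passed `id_hash_keys` to control id generation, one possible migration (illustrative only, not prescribed by this commit) is to set an explicit `id` when constructing the `Document`:

from haystack.preview.dataclasses import Document

# Previously, id_hash_keys=["text"] would restrict the hash to the text field.
# With the field removed, pass an explicit id when a custom scheme is needed;
# the source_url used here is purely illustrative.
source_url = "https://example.com/page-1"
doc = Document(id=source_url, text="some text", metadata={"url": source_url})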
12 changes: 1 addition & 11 deletions test/preview/dataclasses/test_document.py
@@ -87,7 +87,6 @@ def test_empty_document_to_dict():
"blob": None,
"mime_type": "text/plain",
"metadata": {},
"id_hash_keys": ["text", "array", "dataframe", "blob"],
"score": None,
"embedding": None,
}
@@ -107,7 +106,6 @@ def test_full_document_to_dict():
blob=b"some bytes",
mime_type="application/pdf",
metadata={"some": "values", "test": 10},
id_hash_keys=["text", "array", "dataframe", "blob"],
score=0.99,
embedding=np.zeros([10, 10]),
)
@@ -130,7 +128,6 @@ def test_full_document_to_dict():
"text": "test text",
"mime_type": "application/pdf",
"metadata": {"some": "values", "test": 10},
"id_hash_keys": ["text", "array", "dataframe", "blob"],
"score": 0.99,
}

@@ -172,7 +169,6 @@ def test_empty_document_to_json():
"dataframe": None,
"mime_type": "text/plain",
"metadata": {},
"id_hash_keys": ["text", "array", "dataframe", "blob"],
"score": None,
"embedding": None,
}
@@ -197,7 +193,6 @@ def __repr__(self):
blob=b"some bytes",
mime_type="application/pdf",
metadata={"some object": TestClass(), "a path": tmp_path / "test.txt"},
id_hash_keys=["text", "array", "dataframe", "blob"],
score=0.5,
embedding=np.array([1, 2, 3, 4]),
)
@@ -209,7 +204,6 @@ def __repr__(self):
"dataframe": '{"0":{"0":10,"1":20,"2":30}}',
"mime_type": "application/pdf",
"metadata": {"some object": "<the object>", "a path": str((tmp_path / "test.txt").absolute())},
"id_hash_keys": ["text", "array", "dataframe", "blob"],
"score": 0.5,
"embedding": [1, 2, 3, 4],
}
@@ -252,7 +246,7 @@ def __eq__(self, other):


@pytest.mark.unit
def test_to_json_custom_encoder(tmp_path):
def test_to_json_custom_encoder():
class SerializableTestClass:
...

@@ -273,7 +267,6 @@ def default(self, obj):
"dataframe": None,
"mime_type": "text/plain",
"metadata": {"some object": "<<CUSTOM ENCODING>>"},
"id_hash_keys": ["text", "array", "dataframe", "blob"],
"score": None,
"embedding": None,
},
@@ -329,7 +322,6 @@ def test_flatten_document_no_meta():
"mime_type": "text/plain",
"score": None,
"embedding": None,
"id_hash_keys": ["text", "array", "dataframe", "blob"],
}


@@ -347,7 +339,6 @@ def test_flatten_document_with_flat_meta():
"embedding": None,
"some-key": "a value",
"another-key": "another value!",
"id_hash_keys": ["text", "array", "dataframe", "blob"],
}


@@ -365,5 +356,4 @@ def test_flatten_document_with_nested_meta():
"embedding": None,
"some-key": "a value",
"nested": {"key": 10, "key2": 50},
"id_hash_keys": ["text", "array", "dataframe", "blob"],
}
