
Commit

Merge branch 'deepset-ai:main' into docemb
shag1802 authored Oct 23, 2023
2 parents 95af3c5 + 8f28928 commit 11070ef
Showing 5 changed files with 17 additions and 28 deletions.
17 changes: 2 additions & 15 deletions haystack/preview/dataclasses/document.py
@@ -3,7 +3,7 @@
import logging
from dataclasses import asdict, dataclass, field, fields
from pathlib import Path
from typing import Any, Dict, List, Optional, Type
from typing import Any, Dict, Optional, Type

import numpy
import pandas
@@ -48,33 +48,21 @@ def document_decoder(self, dictionary):
return dictionary


def id_hash_keys_default_factory():
"""
Default factory for the id_hash_keys field of the Document dataclass.
We need a callable instead of a default value, because mutable default values are not allowed.
"""
return ["text", "array", "dataframe", "blob"]


@dataclass
class Document:
"""
Base data class containing some data to be queried.
Can contain text snippets, tables, and file paths to images or audio files.
Documents can be sorted by score and serialized to and from dictionaries and JSON.
:param id: Unique identifier for the document. When not set, it's generated based on the document's attributes (see id_hash_keys).
:param id: Unique identifier for the document. When not set, it's generated based on the Document fields' values.
:param text: Text of the document, if the document contains text.
:param array: Array of numbers associated with the document, if the document contains matrix data such as
images, audio, or video.
:param dataframe: Pandas dataframe with the document's content, if the document contains tabular data.
:param blob: Binary data associated with the document, if any.
:param mime_type: MIME type of the document. Defaults to "text/plain".
:param metadata: Additional custom metadata for the document.
:param id_hash_keys: List of keys to use for the ID hash. Defaults to the four content fields of the document:
text, array, dataframe and blob. This field can include other document fields (like mime_type) and metadata's
top-level keys. Note that the order of the keys is important: the ID hash will be generated by concatenating
the values of the keys in the order they appear in this list. Changing the order impacts the ID hash.
:param score: Score of the document. Used for ranking, usually assigned by retrievers.
:param embedding: Vector representation of the document.
"""
@@ -86,7 +74,6 @@ class Document:
blob: Optional[bytes] = field(default=None)
mime_type: str = field(default="text/plain")
metadata: Dict[str, Any] = field(default_factory=dict)
id_hash_keys: List[str] = field(default_factory=id_hash_keys_default_factory, hash=False)
score: Optional[float] = field(default=None)
embedding: Optional[numpy.ndarray] = field(default=None, repr=False)

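After this change, a `Document` carries only its content fields plus `score` and `embedding`, and its `id` is derived from the document's own values when not provided. A minimal usage sketch (assuming `Document` is importable from `haystack.preview.dataclasses`, as the file path above suggests):

from haystack.preview.dataclasses import Document

# No id is passed, so it is generated automatically from the document's
# field values (text, dataframe, blob, mime_type, metadata, ...).
doc = Document(text="some text", metadata={"source": "example"})
print(doc.id)         # auto-generated identifier
print(doc.mime_type)  # "text/plain" by default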
2 changes: 1 addition & 1 deletion releasenotes/notes/doc-id-rework-85e82b5359282f7e.yaml
@@ -2,4 +2,4 @@
preview:
- |
Rework `Document.id` generation: if an `id` is not explicitly set, it's generated
using all `Document` fields' values; `score` and `id_hash_keys` are not used.
using all `Document` fields' values; `score` is not used.
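The exact hashing scheme is not shown in this diff; a rough, purely illustrative sketch of deriving a deterministic id from field values (the real implementation may hash a different set of fields) could look like:

import hashlib

def generate_id(text, dataframe, blob, mime_type, metadata) -> str:
    # Concatenate string representations of the content-bearing fields;
    # score deliberately does not participate in the hash, per the note above.
    data = f"{text}{dataframe}{blob}{mime_type}{metadata}"
    return hashlib.sha256(data.encode("utf-8")).hexdigest()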
2 changes: 1 addition & 1 deletion releasenotes/notes/refactor-openai-document-embedder.yaml
@@ -1,4 +1,4 @@
---
enhancements:
preview:
- |
Refactor OpenAIDocumentEmbedder to enrich documents with embeddings instead of recreating them.
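In practice, "enrich instead of recreate" means the embedder assigns embeddings onto the incoming `Document` objects rather than constructing new ones, so ids and metadata are preserved. A simplified sketch (function and variable names here are illustrative, not the component's actual API):

def attach_embeddings(documents, embeddings):
    # Mutate the existing Document objects in place instead of building
    # new ones, so ids, metadata, and scores are left untouched.
    for doc, emb in zip(documents, embeddings):
        doc.embedding = emb
    return documents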
12 changes: 12 additions & 0 deletions releasenotes/notes/remove-id-hash-document-93e4a589b3fd2aad.yaml
@@ -0,0 +1,12 @@
---
preview:
- |
Remove `id_hash_keys` field from `Document` dataclass.
`id_hash_keys` has also been removed from the Components that were using it:
* `DocumentCleaner`
* `TextDocumentSplitter`
* `PyPDFToDocument`
* `AzureOCRDocumentConverter`
* `HTMLToDocument`
* `TextFileToDocument`
* `TikaDocumentConverter`
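For code that previously passed `id_hash_keys` to control id generation, one possible migration (illustrative only, not prescribed by this commit) is to set an explicit `id` when constructing the `Document`:

from haystack.preview.dataclasses import Document

# Previously, id_hash_keys=["text"] would restrict the hash to the text field.
# With the field removed, pass an explicit id when a custom scheme is needed;
# the source_url used here is purely illustrative.
source_url = "https://example.com/page-1"
doc = Document(id=source_url, text="some text", metadata={"url": source_url})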
12 changes: 1 addition & 11 deletions test/preview/dataclasses/test_document.py
@@ -87,7 +87,6 @@ def test_empty_document_to_dict():
"blob": None,
"mime_type": "text/plain",
"metadata": {},
"id_hash_keys": ["text", "array", "dataframe", "blob"],
"score": None,
"embedding": None,
}
@@ -107,7 +106,6 @@ def test_full_document_to_dict():
blob=b"some bytes",
mime_type="application/pdf",
metadata={"some": "values", "test": 10},
id_hash_keys=["text", "array", "dataframe", "blob"],
score=0.99,
embedding=np.zeros([10, 10]),
)
@@ -130,7 +128,6 @@ def test_full_document_to_dict():
"text": "test text",
"mime_type": "application/pdf",
"metadata": {"some": "values", "test": 10},
"id_hash_keys": ["text", "array", "dataframe", "blob"],
"score": 0.99,
}

@@ -172,7 +169,6 @@ def test_empty_document_to_json():
"dataframe": None,
"mime_type": "text/plain",
"metadata": {},
"id_hash_keys": ["text", "array", "dataframe", "blob"],
"score": None,
"embedding": None,
}
@@ -197,7 +193,6 @@ def __repr__(self):
blob=b"some bytes",
mime_type="application/pdf",
metadata={"some object": TestClass(), "a path": tmp_path / "test.txt"},
id_hash_keys=["text", "array", "dataframe", "blob"],
score=0.5,
embedding=np.array([1, 2, 3, 4]),
)
@@ -209,7 +204,6 @@ def __repr__(self):
"dataframe": '{"0":{"0":10,"1":20,"2":30}}',
"mime_type": "application/pdf",
"metadata": {"some object": "<the object>", "a path": str((tmp_path / "test.txt").absolute())},
"id_hash_keys": ["text", "array", "dataframe", "blob"],
"score": 0.5,
"embedding": [1, 2, 3, 4],
}
@@ -252,7 +246,7 @@ def __eq__(self, other):


@pytest.mark.unit
def test_to_json_custom_encoder(tmp_path):
def test_to_json_custom_encoder():
class SerializableTestClass:
...

@@ -273,7 +267,6 @@ def default(self, obj):
"dataframe": None,
"mime_type": "text/plain",
"metadata": {"some object": "<<CUSTOM ENCODING>>"},
"id_hash_keys": ["text", "array", "dataframe", "blob"],
"score": None,
"embedding": None,
},
@@ -329,7 +322,6 @@ def test_flatten_document_no_meta():
"mime_type": "text/plain",
"score": None,
"embedding": None,
"id_hash_keys": ["text", "array", "dataframe", "blob"],
}


@@ -347,7 +339,6 @@ def test_flatten_document_with_flat_meta():
"embedding": None,
"some-key": "a value",
"another-key": "another value!",
"id_hash_keys": ["text", "array", "dataframe", "blob"],
}


@@ -365,5 +356,4 @@ def test_flatten_document_with_nested_meta():
"embedding": None,
"some-key": "a value",
"nested": {"key": 10, "key2": 50},
"id_hash_keys": ["text", "array", "dataframe", "blob"],
}
