diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index a29b9ea034..34ee19c5df 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -12,6 +12,7 @@ import re import uuid from itertools import groupby +from pathlib import Path from types import MappingProxyType from typing import Any, Callable, FrozenSet, Optional, Sequence, cast @@ -763,7 +764,7 @@ def id_to_hash(self, sequence_number: int) -> str: Returns: new ID value """ - data = f"{self.metadata.filename}{self.text}{self.metadata.page_number}{sequence_number}" + data = f"{Path(self.metadata.filename).stem}{self.text}{self.metadata.page_number}{sequence_number}" self._element_id = hashlib.sha256(data.encode()).hexdigest()[:32] return self.id