diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index 4af1a64aff..44078ea821 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -12,7 +12,7 @@ import re import uuid from types import MappingProxyType -from typing import Any, Callable, FrozenSet, Optional, Sequence, cast +from typing import Any, Callable, FrozenSet, List, Optional, Sequence, cast from typing_extensions import ParamSpec, TypeAlias, TypedDict @@ -32,12 +32,6 @@ class NoID(abc.ABC): """Class to indicate that an element do not have an ID.""" -class HashValue(str): - """Class to indicate that an element has a hash value assigned to its ID.""" - - pass - - class UUID(abc.ABC): """Class to indicate that an element should have a UUID.""" @@ -510,8 +504,6 @@ def field_consolidation_strategies(cls) -> dict[str, ConsolidationStrategy]: _P = ParamSpec("_P") -from typing import List - def calculate_hash(text: str, page_number: int, index_in_sequence: int) -> str: """ @@ -817,35 +809,16 @@ def __init__( self.text: str = text self.embeddings: Optional[list[float]] = embeddings - if isinstance(element_id, NoID): - self.id = self._calculate_hash() - elif isinstance(element_id, UUID): - self.id = uuid.uuid4() - elif isinstance(element_id, str): - self.id = element_id - else: - raise ValueError("ID must be a string, UUID, or NoID") + element_id = str(uuid.uuid4()) super().__init__( - element_id=self.id, + element_id=element_id, metadata=metadata, coordinates=coordinates, coordinate_system=coordinate_system, detection_origin=detection_origin, ) - def _calculate_hash(self, index_in_sequence: int = 0) -> HashValue: - """Calculate the hash depending on element's text and index in sequence. - - Args: - index_in_sequence: Index of the element in the sequence of all elements. - - Returns: - HashValue - 128-bit hash value of the element. - """ - data = f"{self.text}" - return HashValue(hashlib.sha256(data.encode()).hexdigest()[:32]) - def __eq__(self, other: object): if not isinstance(other, Text): return False diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index 589c9835e7..244323d35e 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -593,7 +593,6 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]: metadata_kwargs = { kwarg: params.get(kwarg) for kwarg in ("filename", "url", "text_as_html") } - # NOTE (yao): do not use cast here as cast(None) still is None if not str(kwargs.get("model_name", "")).startswith("chipper"): # NOTE(alan): Skip hierarchy if using chipper, as it should take care of that diff --git a/unstructured/partition/common.py b/unstructured/partition/common.py index 698bbcd7d3..f15c73bb2e 100644 --- a/unstructured/partition/common.py +++ b/unstructured/partition/common.py @@ -19,9 +19,7 @@ Element, ElementMetadata, ElementType, - HashValue, ListItem, - NoID, PageBreak, Text, Title,