Skip to content

Commit

Permalink
refactor: add constant HTML_MAX_PREDECESSOR_LEN
Browse files Browse the repository at this point in the history
  • Loading branch information
christinestraub committed Apr 2, 2024
1 parent 8a239b3 commit ca5249f
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 2 deletions.
9 changes: 7 additions & 2 deletions unstructured/documents/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import sys
from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Tuple, cast

from unstructured.partition.utils.constants import HTML_MAX_PREDECESSOR_LEN

if sys.version_info < (3, 8):
from typing_extensions import Final
else:
Expand Down Expand Up @@ -568,7 +570,10 @@ def _unfurl_break_tags(tag_elem: etree._Element) -> List[etree._Element]:
return unfurled


def _is_text_tag(tag_elem: etree._Element, max_predecessor_len: int = 5) -> bool:
def _is_text_tag(
tag_elem: etree._Element,
max_predecessor_len: int = HTML_MAX_PREDECESSOR_LEN,
) -> bool:
"""True when `tag_elem` potentially contains narrative text."""
# NOTE(robinson) - Only consider elements with limited depth. Otherwise,
# it could be the text representation of a giant div
Expand All @@ -594,7 +599,7 @@ def _is_text_tag(tag_elem: etree._Element, max_predecessor_len: int = 5) -> bool

def _process_list_item(
tag_elem: etree._Element,
max_predecessor_len: int = 5,
max_predecessor_len: int = HTML_MAX_PREDECESSOR_LEN,
) -> Tuple[Optional[Element], Optional[etree._Element]]:
"""Produces an `HTMLListItem` document element from `tag_elem`.
Expand Down
2 changes: 2 additions & 0 deletions unstructured/partition/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,3 +202,5 @@ class PartitionStrategy:

# default image colors
IMAGE_COLOR_DEPTH = 32

# default `max_predecessor_len` for the HTML helpers `_is_text_tag` and `_process_list_item`
HTML_MAX_PREDECESSOR_LEN = 5

0 comments on commit ca5249f

Please sign in to comment.