Skip to content

Commit

Permalink
edit condition
Browse files: browse the repository at this point in the history
  • Loading branch information
Klaijan committed Sep 29, 2023
1 parent b85fbdb commit e934352
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 5 deletions.
3 changes: 3 additions & 0 deletions test_unstructured/partition/pdf-image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -953,3 +953,6 @@ def test_partition_pdf_uses_model_name():
mockpartition.assert_called_once()
assert "model_name" in mockpartition.call_args.kwargs
assert mockpartition.call_args.kwargs["model_name"]


# def test_partition_pdf_
8 changes: 3 additions & 5 deletions unstructured/partition/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1091,8 +1091,8 @@ def get_word_bounding_box_from_element(
word = ""
x1, y1, x2, y2 = None, None, None, None
start_index = 0
set_alnum = False
for index, character in enumerate(text_line):
breakpoint()
if isinstance(character, LTChar):
characters.append(character)
char = character.get_text()
Expand All @@ -1106,11 +1106,9 @@ def get_word_bounding_box_from_element(

# TODO(klaijan) - isalnum() only works with A-Z, a-z and 0-9
# will need to switch to some pattern matching once we support more languages
if not set_alnum:
if not word:
isalnum = char.isalnum()
set_alnum = True

if char.isalnum() != isalnum:
if word and char.isalnum() != isalnum:
isalnum = char.isalnum()
words.append(
{"text": word, "bbox": (x1, y1, x2, y2), "start_index": start_index},
Expand Down

0 comments on commit e934352

Please sign in to comment.