Skip to content

Commit

Permalink
feat: clean cid
Browse files Browse the repository at this point in the history
  • Loading branch information
christinestraub committed May 8, 2024
1 parent 9a7e8aa commit 5fb5b4c
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 1 deletion.
6 changes: 6 additions & 0 deletions unstructured/partition/pdf_image/pdf_image_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,12 @@ def is_cid_present(text: str) -> bool:
return text.find("(cid:") != -1


def clean_cid(text: str) -> str:
    """Return *text* with every ``(cid:<digits>)`` marker removed.

    Each match of the pattern is replaced with the empty string; everything
    else in *text* is left untouched. Markers without at least one digit
    (e.g. ``(cid:)`` or ``(cid:abc)``) do not match and are kept as-is.
    """
    return re.sub(r"\(cid\:(\d+)\)", "", text)


def annotate_layout_elements_with_image(
inferred_page_layout: "PageLayout",
extracted_page_layout: Optional["PageLayout"],
Expand Down
5 changes: 4 additions & 1 deletion unstructured/partition/pdf_image/pdfminer_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from pdfminer.utils import open_filename

from unstructured.documents.elements import ElementType
from unstructured.partition.pdf_image.pdf_image_utils import clean_cid
from unstructured.partition.pdf_image.pdfminer_utils import (
get_images_from_pdf_element,
open_pdfminer_pages_generator,
Expand Down Expand Up @@ -66,12 +67,14 @@ def process_data_with_pdfminer(
else:
continue

text = clean_cid(_text) if _text else _text

text_region = element_class.from_coords(
x1 * coef,
y1 * coef,
x2 * coef,
y2 * coef,
text=_text,
text=text,
source=Source.PDFMINER,
)

Expand Down

0 comments on commit 5fb5b4c

Please sign in to comment.