Skip to content

Commit

Permalink
image alt support (#3797)
Browse files Browse the repository at this point in the history
  • Loading branch information
plutasnyy authored Nov 26, 2024
1 parent 626f73a commit e48d79e
Show file tree
Hide file tree
Showing 10 changed files with 192 additions and 12 deletions.
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
## 0.16.7

### Enhancements
- **Add image_alt_mode to partition_html** Adds an `image_alt_mode` parameter to `partition_html()` that controls whether image alt text is extracted from HTML documents. When set to `to_text` (the default), the v2 HTML parser includes the `alt` attribute of `<img>` tags in the element text; when set to `None`, alt text is omitted.

### Features

### Fixes

## 0.16.6

### Enhancements
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<body class="Document" id="897a8a47377c4ad6aab839a929879537">
<div class="Page" data-page-number="1" id="3a6b156a81764e17be128264241f8136">
<header class="Header" id="6135aeb6-9558-46e2-9da4-473a74db3e9d">
<img alt="New York logo" class="Logo" id="33d66969-b274-4f88-abaa-e7f258b1595f"/>
<img alt="A line graph showing the comparison of 5 year cumulative total return for stocks" class="Image" id="40c32fd8-9a02-42b8-a587-884293881090"/>
</header>
</div>
</body>
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,10 @@ def test_parsed_ontology_can_be_serialized_from_json(json_file_path):
[
("html_files/example.html", "unstructured_json_output/example.json"),
("html_files/example_full_doc.html", "unstructured_json_output/example_full_doc.json"),
(
"html_files/example_with_alternative_text.html",
"unstructured_json_output/example_with_alternative_text.json",
),
("html_files/three_tables.html", "unstructured_json_output/three_tables.json"),
(
"html_files/example_with_inline_fields.html",
Expand All @@ -191,13 +195,13 @@ def test_parsed_ontology_can_be_serialized_from_json(json_file_path):
def test_parsed_ontology_can_be_serialized_from_html(html_file_path, json_file_path):
html_file_path = Path(__file__).parent / html_file_path
json_file_path = Path(__file__).parent / json_file_path

expected_json_elements = elements_from_json(str(json_file_path))
html_code = html_file_path.read_text()

predicted_elements = partition_html(
text=html_code, html_parser_version="v2", unique_element_ids=True
)

assert len(expected_json_elements) == len(predicted_elements)

for i in range(len(expected_json_elements)):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
[
{
"element_id": "3a6b156a81764e17be128264241f8136",
"metadata": {
"category_depth": 0,
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "897a8a47377c4ad6aab839a929879537",
"text_as_html": "<div class=\"Page\" data-page-number=\"1\" id=\"3a6b156a81764e17be128264241f8136\" />"
},
"text": "",
"type": "UncategorizedText"
},
{
"element_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d",
"metadata": {
"category_depth": 1,
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "3a6b156a81764e17be128264241f8136",
"text_as_html": "<header class=\"Header\" id=\"6135aeb6-9558-46e2-9da4-473a74db3e9d\" />"
},
"text": "",
"type": "UncategorizedText"
},
{
"element_id": "33d66969-b274-4f88-abaa-e7f258b1595f",
"metadata": {
"category_depth": 2,
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d",
"text_as_html": "<img class=\"Logo\" alt=\"New York logo\" id=\"33d66969-b274-4f88-abaa-e7f258b1595f\" />"
},
"text": "New York logo",
"type": "Image"
},
{
"element_id": "40c32fd8-9a02-42b8-a587-884293881090",
"metadata": {
"category_depth": 2,
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"parent_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d",
"text_as_html": "<img class=\"Image\" alt=\"A line graph showing the comparison of 5 year cumulative total return for stocks\" id=\"40c32fd8-9a02-42b8-a587-884293881090\" />"
},
"text": "A line graph showing the comparison of 5 year cumulative total return for stocks",
"type": "Image"
}
]
Original file line number Diff line number Diff line change
Expand Up @@ -555,3 +555,21 @@ def test_inline_elements_are_squeezed_when_text_wrapped_into_paragraphs():
assert len(unstructured_elements) == 2
assert isinstance(unstructured_elements[0], Text)
assert isinstance(unstructured_elements[1], NarrativeText)


def test_alternate_text_from_image_is_passed():
    """Alt text of an <img> nested in a table cell shows up in the element text.

    Relies on `ontology_to_unstructured_elements()` including image alt text by default
    (no `add_img_alt_text` argument is passed here).
    """
    # -- the fixture previously carried a stray `add_img_alt_text` token after the closing
    # -- </div>; it was not part of the markup under test and has been removed.
    # language=HTML
    input_html = """
<div class="Page">
<table>
<tr>
<td rowspan="2">Example image nested in the table:</td>
<td rowspan="2"><img src="my-logo.png" alt="ALT TEXT Logo"></td>
</tr>
</table>
</div>
"""
    page = parse_html_to_ontology(input_html)
    unstructured_elements = ontology_to_unstructured_elements(page)

    assert len(unstructured_elements) == 2
    assert "ALT TEXT Logo" in unstructured_elements[1].text
47 changes: 47 additions & 0 deletions test_unstructured/partition/html/test_partition_v2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from unstructured.partition.html import partition_html


def test_alternative_image_text_can_be_included():
    """Alt text appears with `image_alt_mode="to_text"` and is absent with `None`."""
    # language=HTML
    html = """
<div class="Page">
<img src="my-logo.png" alt="ALT TEXT Logo"/>
</div>
"""
    shared_kwargs = {"text": html, "html_parser_version": "v2"}

    # -- exactly two elements are expected; the second one carries the text under test --
    elements_with_alt = partition_html(image_alt_mode="to_text", **shared_kwargs)
    _, image_to_text_alt_mode = elements_with_alt
    assert "ALT TEXT Logo" in image_to_text_alt_mode.text

    elements_without_alt = partition_html(image_alt_mode=None, **shared_kwargs)
    _, image_none_alt_mode = elements_without_alt
    assert "ALT TEXT Logo" not in image_none_alt_mode.text


def test_alternative_image_text_can_be_included_when_nested_in_paragraph():
    """Same check as the flat-image case, with the <img> wrapped in a paragraph."""
    # language=HTML
    html = """
<div class="Page">
<p class="Paragraph">
<img src="my-logo.png" alt="ALT TEXT Logo"/>
</p>
</div>
"""

    def second_element(image_alt_mode):
        # -- unpacking requires exactly two elements, as the original assertions did --
        _, element = partition_html(
            text=html,
            image_alt_mode=image_alt_mode,
            html_parser_version="v2",
        )
        return element

    paragraph_to_text_alt_mode = second_element("to_text")
    assert "ALT TEXT Logo" in paragraph_to_text_alt_mode.text

    paragraph_none_alt_mode = second_element(None)
    assert "ALT TEXT Logo" not in paragraph_none_alt_mode.text
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.16.6" # pragma: no cover
__version__ = "0.16.7" # pragma: no cover
26 changes: 21 additions & 5 deletions unstructured/documents/ontology.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,11 +89,27 @@ def to_html(self, add_children=True) -> str:

return result_html

def to_text(self, add_children=True) -> str:
def to_text(self, add_children=True, add_img_alt_text=True) -> str:
"""
Returns the text representation of the element.
Args:
add_children: If True, the text of the children will be included.
Otherwise, element is represented as single self-closing tag.
add_img_alt_text: If True, the alt text of the image will be included.
"""
if self.children and add_children:
children_text = " ".join(child.to_text().strip() for child in self.children)
children_text = " ".join(
child.to_text(add_children, add_img_alt_text).strip() for child in self.children
)
return children_text
return BeautifulSoup(self.to_html(), "html.parser").get_text().strip()

text = BeautifulSoup(self.to_html(), "html.parser").get_text().strip()

if add_img_alt_text and self.html_tag_name == "img" and "alt" in self.additional_attributes:
text += f" {self.additional_attributes.get('alt', '')}"

return text.strip()

def _construct_attribute_string(self, attributes: dict) -> str:
return " ".join(
Expand Down Expand Up @@ -473,8 +489,8 @@ class FormFieldValue(OntologyElement):
elementType: ElementTypeEnum = Field(ElementTypeEnum.form, frozen=True)
allowed_tags: List[str] = Field(["input"], frozen=True)

def to_text(self, add_children=True) -> str:
text = super().to_text()
def to_text(self, add_children=True, add_img_alt_text=True) -> str:
text = super().to_text(add_children, add_img_alt_text)
value = self.additional_attributes.get("value", "")
if not value:
return text
Expand Down
16 changes: 15 additions & 1 deletion unstructured/partition/html/partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def partition_html(
skip_headers_and_footers: bool = False,
detection_origin: Optional[str] = None,
html_parser_version: Literal["v1", "v2"] = "v1",
image_alt_mode: Optional[Literal["to_text"]] = "to_text",
**kwargs: Any,
) -> list[Element]:
"""Partitions an HTML document into its constituent elements.
Expand Down Expand Up @@ -65,6 +66,9 @@ def partition_html(
html_parser_version (Literal['v1', 'v2']):
The version of the HTML parser to use. The default is 'v1'. For 'v2' the parser will
use the ontology schema to parse the HTML document.
image_alt_mode (Optional[Literal['to_text']]):
When set to 'to_text' (the default), the v2 parser will include the alternative text of
images in the output. When None, the alternative text is omitted.
"""
# -- parser rejects an empty str, nip that edge-case in the bud here --
if text is not None and text.strip() == "" and not file and not filename and not url:
Expand All @@ -81,6 +85,7 @@ def partition_html(
skip_headers_and_footers=skip_headers_and_footers,
detection_origin=detection_origin,
html_parser_version=html_parser_version,
image_alt_mode=image_alt_mode,
)

return list(_HtmlPartitioner.iter_elements(opts))
Expand All @@ -102,6 +107,7 @@ def __init__(
skip_headers_and_footers: bool,
detection_origin: str | None,
html_parser_version: Literal["v1", "v2"] = "v1",
image_alt_mode: Optional[Literal["to_text"]] = "to_text",
):
self._file_path = file_path
self._file = file
Expand All @@ -113,6 +119,7 @@ def __init__(
self._skip_headers_and_footers = skip_headers_and_footers
self._detection_origin = detection_origin
self._html_parser_version = html_parser_version
self._image_alt_mode = image_alt_mode

@lazyproperty
def detection_origin(self) -> str | None:
Expand Down Expand Up @@ -172,6 +179,11 @@ def html_parser_version(self) -> Literal["v1", "v2"]:
"""When html_parser_version=='v2', HTML elements follow ontology schema."""
return self._html_parser_version

@lazyproperty
def add_img_alt_text(self) -> bool:
    """When True, the alternative text of images is included in the output.

    True only when the `image_alt_mode` option is "to_text"; any other value, including
    None, yields False.
    """
    return self._image_alt_mode == "to_text"


class _HtmlPartitioner:
"""Partition HTML document into document-elements."""
Expand Down Expand Up @@ -239,5 +251,7 @@ def _from_ontology(self) -> List[Element]:
"""Parse the HTML text into an ontology and convert it to unstructured elements."""
html_text = self._opts.html_text
ontology = parse_html_to_ontology(html_text)
unstructured_elements = ontology_to_unstructured_elements(ontology)
unstructured_elements = ontology_to_unstructured_elements(
ontology, add_img_alt_text=self._opts.add_img_alt_text
)
return unstructured_elements
10 changes: 6 additions & 4 deletions unstructured/partition/html/transformations.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def ontology_to_unstructured_elements(
page_number: int = None,
depth: int = 0,
filename: str | None = None,
add_img_alt_text: bool = True,
) -> list[elements.Element]:
"""
Converts an OntologyElement object to a list of unstructured Element objects.
Expand All @@ -44,7 +45,9 @@ def ontology_to_unstructured_elements(
parent_id (str, optional): The ID of the parent element. Defaults to None.
page_number (int, optional): The page number of the element. Defaults to None.
depth (int, optional): The depth of the element in the hierarchy. Defaults to 0.
filename (str, optional): The name of the file the element comes from. Defaults to None.
add_img_alt_text (bool): Whether to include the alternative text of images
in the output. Defaults to True.
Returns:
list[Element]: A list of unstructured Element objects.
"""
Expand Down Expand Up @@ -77,6 +80,7 @@ def ontology_to_unstructured_elements(
page_number=page_number,
depth=0 if isinstance(ontology_element, ontology.Document) else depth + 1,
filename=filename,
add_img_alt_text=add_img_alt_text,
)
children += child

Expand All @@ -85,7 +89,7 @@ def ontology_to_unstructured_elements(
else:
element_class = ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE[ontology_element.__class__]
html_code_of_ontology_element = ontology_element.to_html()
element_text = ontology_element.to_text()
element_text = ontology_element.to_text(add_img_alt_text=add_img_alt_text)

unstructured_element = element_class(
text=element_text,
Expand Down Expand Up @@ -278,7 +282,6 @@ def parse_html_to_ontology(html_code: str) -> ontology.OntologyElement:
Args:
html_code (str): The HTML code to be parsed.
Parsing HTML will start from <div class="Page">.
Returns:
OntologyElement: The parsed Element object.
Expand Down Expand Up @@ -352,7 +355,6 @@ def parse_html_to_ontology_element(
Args:
soup (Tag): The BeautifulSoup Tag object to be converted.
recursion_depth (int): Flag to control limit of recursion depth.
Returns:
OntologyElement: The converted OntologyElement object.
"""
Expand Down

0 comments on commit e48d79e

Please sign in to comment.