diff --git a/CHANGELOG.md b/CHANGELOG.md
index 26a3d559f2..8bcfb375d2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,12 @@
+## 0.16.7
+
+### Enhancements
+- **Add image_alt_mode to partition_html** Adds an `image_alt_mode` parameter to `partition_html()` to control how alt text is extracted from images in HTML documents. Set it to `to_text` (the default) to include each image's alt text in the element text, or to `None` to omit it. The parameter is used by the v2 HTML parser (`html_parser_version="v2"`).
+
+### Features
+
+### Fixes
+
## 0.16.6
### Enhancements
diff --git a/test_unstructured/documents/html_files/example_with_alternative_text.html b/test_unstructured/documents/html_files/example_with_alternative_text.html
new file mode 100644
index 0000000000..64cbf2f605
--- /dev/null
+++ b/test_unstructured/documents/html_files/example_with_alternative_text.html
@@ -0,0 +1,8 @@
+
+
+
+
+
diff --git a/test_unstructured/documents/test_ontology_to_unstructured_parsing.py b/test_unstructured/documents/test_ontology_to_unstructured_parsing.py
index 72dec7d02a..f489be0363 100644
--- a/test_unstructured/documents/test_ontology_to_unstructured_parsing.py
+++ b/test_unstructured/documents/test_ontology_to_unstructured_parsing.py
@@ -181,6 +181,10 @@ def test_parsed_ontology_can_be_serialized_from_json(json_file_path):
[
("html_files/example.html", "unstructured_json_output/example.json"),
("html_files/example_full_doc.html", "unstructured_json_output/example_full_doc.json"),
+ (
+ "html_files/example_with_alternative_text.html",
+ "unstructured_json_output/example_with_alternative_text.json",
+ ),
("html_files/three_tables.html", "unstructured_json_output/three_tables.json"),
(
"html_files/example_with_inline_fields.html",
@@ -191,13 +195,13 @@ def test_parsed_ontology_can_be_serialized_from_json(json_file_path):
def test_parsed_ontology_can_be_serialized_from_html(html_file_path, json_file_path):
html_file_path = Path(__file__).parent / html_file_path
json_file_path = Path(__file__).parent / json_file_path
-
expected_json_elements = elements_from_json(str(json_file_path))
html_code = html_file_path.read_text()
predicted_elements = partition_html(
text=html_code, html_parser_version="v2", unique_element_ids=True
)
+
assert len(expected_json_elements) == len(predicted_elements)
for i in range(len(expected_json_elements)):
diff --git a/test_unstructured/documents/unstructured_json_output/example_with_alternative_text.json b/test_unstructured/documents/unstructured_json_output/example_with_alternative_text.json
new file mode 100644
index 0000000000..f6c32707ea
--- /dev/null
+++ b/test_unstructured/documents/unstructured_json_output/example_with_alternative_text.json
@@ -0,0 +1,62 @@
+[
+ {
+ "element_id": "3a6b156a81764e17be128264241f8136",
+ "metadata": {
+ "category_depth": 0,
+ "filetype": "text/html",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 1,
+ "parent_id": "897a8a47377c4ad6aab839a929879537",
+ "text_as_html": ""
+ },
+ "text": "",
+ "type": "UncategorizedText"
+ },
+ {
+ "element_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d",
+ "metadata": {
+ "category_depth": 1,
+ "filetype": "text/html",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 1,
+ "parent_id": "3a6b156a81764e17be128264241f8136",
+ "text_as_html": ""
+ },
+ "text": "",
+ "type": "UncategorizedText"
+ },
+ {
+ "element_id": "33d66969-b274-4f88-abaa-e7f258b1595f",
+ "metadata": {
+ "category_depth": 2,
+ "filetype": "text/html",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 1,
+ "parent_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d",
+ "text_as_html": ""
+ },
+ "text": "New York logo",
+ "type": "Image"
+ },
+ {
+ "element_id": "40c32fd8-9a02-42b8-a587-884293881090",
+ "metadata": {
+ "category_depth": 2,
+ "filetype": "text/html",
+ "languages": [
+ "eng"
+ ],
+ "page_number": 1,
+ "parent_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d",
+ "text_as_html": ""
+ },
+ "text": "A line graph showing the comparison of 5 year cumulative total return for stocks",
+ "type": "Image"
+ }
+]
\ No newline at end of file
diff --git a/test_unstructured/partition/html/test_html_to_unstructured_and_back_parsing.py b/test_unstructured/partition/html/test_html_to_unstructured_and_back_parsing.py
index a3edcfc024..f4bc09d54a 100644
--- a/test_unstructured/partition/html/test_html_to_unstructured_and_back_parsing.py
+++ b/test_unstructured/partition/html/test_html_to_unstructured_and_back_parsing.py
@@ -555,3 +555,21 @@ def test_inline_elements_are_squeezed_when_text_wrapped_into_paragraphs():
assert len(unstructured_elements) == 2
assert isinstance(unstructured_elements[0], Text)
assert isinstance(unstructured_elements[1], NarrativeText)
+
+
+def test_alternate_text_from_image_is_passed():
+ # language=HTML
+ input_html = """
+
+
+
+ Example image nested in the table: |
+ |
+
+
+
add_img_alt_text
+ """
+ page = parse_html_to_ontology(input_html)
+ unstructured_elements = ontology_to_unstructured_elements(page)
+ assert len(unstructured_elements) == 2
+ assert "ALT TEXT Logo" in unstructured_elements[1].text
diff --git a/test_unstructured/partition/html/test_partition_v2.py b/test_unstructured/partition/html/test_partition_v2.py
new file mode 100644
index 0000000000..4823646f49
--- /dev/null
+++ b/test_unstructured/partition/html/test_partition_v2.py
@@ -0,0 +1,47 @@
+from unstructured.partition.html import partition_html
+
+
+def test_alternative_image_text_can_be_included():
+ # language=HTML
+ html = """
+
+
+
+ """
+ _, image_to_text_alt_mode = partition_html(
+ text=html,
+ image_alt_mode="to_text",
+ html_parser_version="v2",
+ )
+ assert "ALT TEXT Logo" in image_to_text_alt_mode.text
+
+ _, image_none_alt_mode = partition_html(
+ text=html,
+ image_alt_mode=None,
+ html_parser_version="v2",
+ )
+ assert "ALT TEXT Logo" not in image_none_alt_mode.text
+
+
+def test_alternative_image_text_can_be_included_when_nested_in_paragraph():
+ # language=HTML
+ html = """
+
+
+
+
+
+ """
+ _, paragraph_to_text_alt_mode = partition_html(
+ text=html,
+ image_alt_mode="to_text",
+ html_parser_version="v2",
+ )
+ assert "ALT TEXT Logo" in paragraph_to_text_alt_mode.text
+
+ _, paragraph_none_alt_mode = partition_html(
+ text=html,
+ image_alt_mode=None,
+ html_parser_version="v2",
+ )
+ assert "ALT TEXT Logo" not in paragraph_none_alt_mode.text
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index e4054e64f3..8685b152b7 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.16.6" # pragma: no cover
+__version__ = "0.16.7" # pragma: no cover
diff --git a/unstructured/documents/ontology.py b/unstructured/documents/ontology.py
index f2142b82bb..4ca550b8d7 100644
--- a/unstructured/documents/ontology.py
+++ b/unstructured/documents/ontology.py
@@ -89,11 +89,27 @@ def to_html(self, add_children=True) -> str:
return result_html
- def to_text(self, add_children=True) -> str:
+ def to_text(self, add_children=True, add_img_alt_text=True) -> str:
+ """
+ Returns the text representation of the element.
+
+ Args:
+ add_children: If True and the element has children, return the children's
+ texts joined by single spaces instead of this element's own text.
+ add_img_alt_text: If True, the alt text of the image will be included.
+ """
if self.children and add_children:
- children_text = " ".join(child.to_text().strip() for child in self.children)
+ children_text = " ".join(
+ child.to_text(add_children, add_img_alt_text).strip() for child in self.children
+ )
return children_text
- return BeautifulSoup(self.to_html(), "html.parser").get_text().strip()
+
+ text = BeautifulSoup(self.to_html(), "html.parser").get_text().strip()
+
+ if add_img_alt_text and self.html_tag_name == "img" and "alt" in self.additional_attributes:
+ text += f" {self.additional_attributes.get('alt', '')}"
+
+ return text.strip()
def _construct_attribute_string(self, attributes: dict) -> str:
return " ".join(
@@ -473,8 +489,8 @@ class FormFieldValue(OntologyElement):
elementType: ElementTypeEnum = Field(ElementTypeEnum.form, frozen=True)
allowed_tags: List[str] = Field(["input"], frozen=True)
- def to_text(self, add_children=True) -> str:
- text = super().to_text()
+ def to_text(self, add_children=True, add_img_alt_text=True) -> str:
+ text = super().to_text(add_children, add_img_alt_text)
value = self.additional_attributes.get("value", "")
if not value:
return text
diff --git a/unstructured/partition/html/partition.py b/unstructured/partition/html/partition.py
index f49643bd84..865a727b23 100644
--- a/unstructured/partition/html/partition.py
+++ b/unstructured/partition/html/partition.py
@@ -36,6 +36,7 @@ def partition_html(
skip_headers_and_footers: bool = False,
detection_origin: Optional[str] = None,
html_parser_version: Literal["v1", "v2"] = "v1",
+ image_alt_mode: Optional[Literal["to_text"]] = "to_text",
**kwargs: Any,
) -> list[Element]:
"""Partitions an HTML document into its constituent elements.
@@ -65,6 +66,9 @@ def partition_html(
html_parser_version (Literal['v1', 'v2']):
The version of the HTML parser to use. The default is 'v1'. For 'v2' the parser will
use the ontology schema to parse the HTML document.
+
+ image_alt_mode (Optional[Literal['to_text']]):
+ When 'to_text' (default), the v2 parser adds image alt text to the output; None omits it.
"""
# -- parser rejects an empty str, nip that edge-case in the bud here --
if text is not None and text.strip() == "" and not file and not filename and not url:
@@ -81,6 +85,7 @@ def partition_html(
skip_headers_and_footers=skip_headers_and_footers,
detection_origin=detection_origin,
html_parser_version=html_parser_version,
+ image_alt_mode=image_alt_mode,
)
return list(_HtmlPartitioner.iter_elements(opts))
@@ -102,6 +107,7 @@ def __init__(
skip_headers_and_footers: bool,
detection_origin: str | None,
html_parser_version: Literal["v1", "v2"] = "v1",
+ image_alt_mode: Optional[Literal["to_text"]] = "to_text",
):
self._file_path = file_path
self._file = file
@@ -113,6 +119,7 @@ def __init__(
self._skip_headers_and_footers = skip_headers_and_footers
self._detection_origin = detection_origin
self._html_parser_version = html_parser_version
+ self._image_alt_mode = image_alt_mode
@lazyproperty
def detection_origin(self) -> str | None:
@@ -172,6 +179,11 @@ def html_parser_version(self) -> Literal["v1", "v2"]:
"""When html_parser_version=='v2', HTML elements follow ontology schema."""
return self._html_parser_version
+ @lazyproperty
+ def add_img_alt_text(self) -> bool:
+ """When True, the alternative text of images is included in the output."""
+ return self._image_alt_mode == "to_text"
+
class _HtmlPartitioner:
"""Partition HTML document into document-elements."""
@@ -239,5 +251,7 @@ def _from_ontology(self) -> List[Element]:
"""Convert an ontology elements represented in HTML to an ontology element."""
html_text = self._opts.html_text
ontology = parse_html_to_ontology(html_text)
- unstructured_elements = ontology_to_unstructured_elements(ontology)
+ unstructured_elements = ontology_to_unstructured_elements(
+ ontology, add_img_alt_text=self._opts.add_img_alt_text
+ )
return unstructured_elements
diff --git a/unstructured/partition/html/transformations.py b/unstructured/partition/html/transformations.py
index 0f12c967bb..d90e589f8f 100644
--- a/unstructured/partition/html/transformations.py
+++ b/unstructured/partition/html/transformations.py
@@ -24,6 +24,7 @@ def ontology_to_unstructured_elements(
page_number: int = None,
depth: int = 0,
filename: str | None = None,
+ add_img_alt_text: bool = True,
) -> list[elements.Element]:
"""
Converts an OntologyElement object to a list of unstructured Element objects.
@@ -44,7 +45,9 @@ def ontology_to_unstructured_elements(
parent_id (str, optional): The ID of the parent element. Defaults to None.
page_number (int, optional): The page number of the element. Defaults to None.
depth (int, optional): The depth of the element in the hierarchy. Defaults to 0.
-
+ filename (str, optional): The name of the file the element comes from. Defaults to None.
+ add_img_alt_text (bool): Whether to include the alternative text of images
+ in the output. Defaults to True.
Returns:
list[Element]: A list of unstructured Element objects.
"""
@@ -77,6 +80,7 @@ def ontology_to_unstructured_elements(
page_number=page_number,
depth=0 if isinstance(ontology_element, ontology.Document) else depth + 1,
filename=filename,
+ add_img_alt_text=add_img_alt_text,
)
children += child
@@ -85,7 +89,7 @@ def ontology_to_unstructured_elements(
else:
element_class = ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE[ontology_element.__class__]
html_code_of_ontology_element = ontology_element.to_html()
- element_text = ontology_element.to_text()
+ element_text = ontology_element.to_text(add_img_alt_text=add_img_alt_text)
unstructured_element = element_class(
text=element_text,
@@ -278,7 +282,6 @@ def parse_html_to_ontology(html_code: str) -> ontology.OntologyElement:
Args:
html_code (str): The HTML code to be parsed.
Parsing HTML will start from .
-
Returns:
OntologyElement: The parsed Element object.
@@ -352,7 +355,6 @@ def parse_html_to_ontology_element(
Args:
soup (Tag): The BeautifulSoup Tag object to be converted.
recursion_depth (int): Flag to control limit of recursion depth.
-
Returns:
OntologyElement: The converted OntologyElement object.
"""