image alt support (#3797)

Unstructured-IO · Nov 26, 2024 · e48d79e · e48d79e
1 parent 626f73a
commit e48d79e
Show file tree

Hide file tree

Showing 10 changed files with 192 additions and 12 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,12 @@
+## 0.16.7
+
+### Enhancements
+- **Add image_alt_mode to partition_html** Adds an `image_alt_mode` parameter to `partition_html()` to control how alt text is extracted from images in HTML documents. When set to `to_text` (the default), the v2 HTML parser includes the alt text of `<img>` tags in the element text; when set to `None`, alt text is omitted.
+
+### Features
+
+### Fixes
+
 ## 0.16.6
 
 ### Enhancements

diff --git a/test_unstructured/documents/html_files/example_with_alternative_text.html b/test_unstructured/documents/html_files/example_with_alternative_text.html
@@ -0,0 +1,8 @@
+<body class="Document" id="897a8a47377c4ad6aab839a929879537">
+ <div class="Page" data-page-number="1" id="3a6b156a81764e17be128264241f8136">
+   <header class="Header" id="6135aeb6-9558-46e2-9da4-473a74db3e9d">
+    <img alt="New York logo" class="Logo" id="33d66969-b274-4f88-abaa-e7f258b1595f"/>
+    <img alt="A line graph showing the comparison of 5 year cumulative total return for stocks" class="Image" id="40c32fd8-9a02-42b8-a587-884293881090"/>
+   </header>
+ </div>
+</body>
diff --git a/test_unstructured/documents/test_ontology_to_unstructured_parsing.py b/test_unstructured/documents/test_ontology_to_unstructured_parsing.py
@@ -181,6 +181,10 @@ def test_parsed_ontology_can_be_serialized_from_json(json_file_path):
     [
         ("html_files/example.html", "unstructured_json_output/example.json"),
         ("html_files/example_full_doc.html", "unstructured_json_output/example_full_doc.json"),
+        (
+            "html_files/example_with_alternative_text.html",
+            "unstructured_json_output/example_with_alternative_text.json",
+        ),
         ("html_files/three_tables.html", "unstructured_json_output/three_tables.json"),
         (
             "html_files/example_with_inline_fields.html",
@@ -191,13 +195,13 @@ def test_parsed_ontology_can_be_serialized_from_json(json_file_path):
 def test_parsed_ontology_can_be_serialized_from_html(html_file_path, json_file_path):
     html_file_path = Path(__file__).parent / html_file_path
     json_file_path = Path(__file__).parent / json_file_path
-
     expected_json_elements = elements_from_json(str(json_file_path))
     html_code = html_file_path.read_text()
 
     predicted_elements = partition_html(
         text=html_code, html_parser_version="v2", unique_element_ids=True
     )
+
     assert len(expected_json_elements) == len(predicted_elements)
 
     for i in range(len(expected_json_elements)):

diff --git a/test_unstructured/documents/unstructured_json_output/example_with_alternative_text.json b/test_unstructured/documents/unstructured_json_output/example_with_alternative_text.json
@@ -0,0 +1,62 @@
+[
+    {
+        "element_id": "3a6b156a81764e17be128264241f8136",
+        "metadata": {
+            "category_depth": 0,
+            "filetype": "text/html",
+            "languages": [
+                "eng"
+            ],
+            "page_number": 1,
+            "parent_id": "897a8a47377c4ad6aab839a929879537",
+            "text_as_html": "<div class=\"Page\" data-page-number=\"1\" id=\"3a6b156a81764e17be128264241f8136\" />"
+        },
+        "text": "",
+        "type": "UncategorizedText"
+    },
+    {
+        "element_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d",
+        "metadata": {
+            "category_depth": 1,
+            "filetype": "text/html",
+            "languages": [
+                "eng"
+            ],
+            "page_number": 1,
+            "parent_id": "3a6b156a81764e17be128264241f8136",
+            "text_as_html": "<header class=\"Header\" id=\"6135aeb6-9558-46e2-9da4-473a74db3e9d\" />"
+        },
+        "text": "",
+        "type": "UncategorizedText"
+    },
+    {
+        "element_id": "33d66969-b274-4f88-abaa-e7f258b1595f",
+        "metadata": {
+            "category_depth": 2,
+            "filetype": "text/html",
+            "languages": [
+                "eng"
+            ],
+            "page_number": 1,
+            "parent_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d",
+            "text_as_html": "<img class=\"Logo\" alt=\"New York logo\" id=\"33d66969-b274-4f88-abaa-e7f258b1595f\" />"
+        },
+        "text": "New York logo",
+        "type": "Image"
+    },
+    {
+        "element_id": "40c32fd8-9a02-42b8-a587-884293881090",
+        "metadata": {
+            "category_depth": 2,
+            "filetype": "text/html",
+            "languages": [
+                "eng"
+            ],
+            "page_number": 1,
+            "parent_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d",
+            "text_as_html": "<img class=\"Image\" alt=\"A line graph showing the comparison of 5 year cumulative total return for stocks\" id=\"40c32fd8-9a02-42b8-a587-884293881090\" />"
+        },
+        "text": "A line graph showing the comparison of 5 year cumulative total return for stocks",
+        "type": "Image"
+    }
+]
diff --git a/test_unstructured/partition/html/test_html_to_unstructured_and_back_parsing.py b/test_unstructured/partition/html/test_html_to_unstructured_and_back_parsing.py
@@ -555,3 +555,21 @@ def test_inline_elements_are_squeezed_when_text_wrapped_into_paragraphs():
     assert len(unstructured_elements) == 2
     assert isinstance(unstructured_elements[0], Text)
     assert isinstance(unstructured_elements[1], NarrativeText)
+
+
+def test_alternate_text_from_image_is_passed():
+    # language=HTML
+    input_html = """
+    <div class="Page">
+    <table>
+        <tr>
+            <td rowspan="2">Example image nested in the table:</td>
+            <td rowspan="2"><img src="my-logo.png" alt="ALT TEXT Logo"></td>
+        </tr>
+    </table>
+    </div>add_img_alt_text
+    """
+    page = parse_html_to_ontology(input_html)
+    unstructured_elements = ontology_to_unstructured_elements(page)
+    assert len(unstructured_elements) == 2
+    assert "ALT TEXT Logo" in unstructured_elements[1].text
diff --git a/test_unstructured/partition/html/test_partition_v2.py b/test_unstructured/partition/html/test_partition_v2.py
@@ -0,0 +1,47 @@
+from unstructured.partition.html import partition_html
+
+
+def test_alternative_image_text_can_be_included():
+    # language=HTML
+    html = """
+    <div class="Page">
+        <img src="my-logo.png" alt="ALT TEXT Logo"/>
+    </div>
+    """
+    _, image_to_text_alt_mode = partition_html(
+        text=html,
+        image_alt_mode="to_text",
+        html_parser_version="v2",
+    )
+    assert "ALT TEXT Logo" in image_to_text_alt_mode.text
+
+    _, image_none_alt_mode = partition_html(
+        text=html,
+        image_alt_mode=None,
+        html_parser_version="v2",
+    )
+    assert "ALT TEXT Logo" not in image_none_alt_mode.text
+
+
+def test_alternative_image_text_can_be_included_when_nested_in_paragraph():
+    # language=HTML
+    html = """
+    <div class="Page">
+        <p class="Paragraph">
+            <img src="my-logo.png" alt="ALT TEXT Logo"/>
+        </p>
+    </div>
+    """
+    _, paragraph_to_text_alt_mode = partition_html(
+        text=html,
+        image_alt_mode="to_text",
+        html_parser_version="v2",
+    )
+    assert "ALT TEXT Logo" in paragraph_to_text_alt_mode.text
+
+    _, paragraph_none_alt_mode = partition_html(
+        text=html,
+        image_alt_mode=None,
+        html_parser_version="v2",
+    )
+    assert "ALT TEXT Logo" not in paragraph_none_alt_mode.text
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.16.6"  # pragma: no cover
+__version__ = "0.16.7"  # pragma: no cover
diff --git a/unstructured/documents/ontology.py b/unstructured/documents/ontology.py
@@ -89,11 +89,27 @@ def to_html(self, add_children=True) -> str:
 
         return result_html
 
-    def to_text(self, add_children=True) -> str:
+    def to_text(self, add_children=True, add_img_alt_text=True) -> str:
+        """
+        Returns the text representation of the element.
+
+        Args:
+            add_children: If True, the text is built by joining the text of the children.
+                            Otherwise, it is extracted from the element's own HTML.
+            add_img_alt_text: If True, the alt text of the image will be included.
+        """
         if self.children and add_children:
-            children_text = " ".join(child.to_text().strip() for child in self.children)
+            children_text = " ".join(
+                child.to_text(add_children, add_img_alt_text).strip() for child in self.children
+            )
             return children_text
-        return BeautifulSoup(self.to_html(), "html.parser").get_text().strip()
+
+        text = BeautifulSoup(self.to_html(), "html.parser").get_text().strip()
+
+        if add_img_alt_text and self.html_tag_name == "img" and "alt" in self.additional_attributes:
+            text += f" {self.additional_attributes.get('alt', '')}"
+
+        return text.strip()
 
     def _construct_attribute_string(self, attributes: dict) -> str:
         return " ".join(
@@ -473,8 +489,8 @@ class FormFieldValue(OntologyElement):
     elementType: ElementTypeEnum = Field(ElementTypeEnum.form, frozen=True)
     allowed_tags: List[str] = Field(["input"], frozen=True)
 
-    def to_text(self, add_children=True) -> str:
-        text = super().to_text()
+    def to_text(self, add_children=True, add_img_alt_text=True) -> str:
+        text = super().to_text(add_children, add_img_alt_text)
         value = self.additional_attributes.get("value", "")
         if not value:
             return text

diff --git a/unstructured/partition/html/partition.py b/unstructured/partition/html/partition.py
@@ -36,6 +36,7 @@ def partition_html(
     skip_headers_and_footers: bool = False,
     detection_origin: Optional[str] = None,
     html_parser_version: Literal["v1", "v2"] = "v1",
+    image_alt_mode: Optional[Literal["to_text"]] = "to_text",
     **kwargs: Any,
 ) -> list[Element]:
     """Partitions an HTML document into its constituent elements.
@@ -65,6 +66,9 @@ def partition_html(
     html_parser_version (Literal['v1', 'v2']):
         The version of the HTML parser to use. The default is 'v1'. For 'v2' the parser will
         use the ontology schema to parse the HTML document.
+
+    image_alt_mode (Optional[Literal['to_text']]):
+        When 'to_text' (default), the v2 parser adds image alt text to the output; None omits it.
     """
     # -- parser rejects an empty str, nip that edge-case in the bud here --
     if text is not None and text.strip() == "" and not file and not filename and not url:
@@ -81,6 +85,7 @@ def partition_html(
         skip_headers_and_footers=skip_headers_and_footers,
         detection_origin=detection_origin,
         html_parser_version=html_parser_version,
+        image_alt_mode=image_alt_mode,
     )
 
     return list(_HtmlPartitioner.iter_elements(opts))
@@ -102,6 +107,7 @@ def __init__(
         skip_headers_and_footers: bool,
         detection_origin: str | None,
         html_parser_version: Literal["v1", "v2"] = "v1",
+        image_alt_mode: Optional[Literal["to_text"]] = "to_text",
     ):
         self._file_path = file_path
         self._file = file
@@ -113,6 +119,7 @@ def __init__(
         self._skip_headers_and_footers = skip_headers_and_footers
         self._detection_origin = detection_origin
         self._html_parser_version = html_parser_version
+        self._image_alt_mode = image_alt_mode
 
     @lazyproperty
     def detection_origin(self) -> str | None:
@@ -172,6 +179,11 @@ def html_parser_version(self) -> Literal["v1", "v2"]:
         """When html_parser_version=='v2', HTML elements follow ontology schema."""
         return self._html_parser_version
 
+    @lazyproperty
+    def add_img_alt_text(self) -> bool:
+        """When True, the alternative text of images is included in the output."""
+        return self._image_alt_mode == "to_text"
+
 
 class _HtmlPartitioner:
     """Partition HTML document into document-elements."""
@@ -239,5 +251,7 @@ def _from_ontology(self) -> List[Element]:
         """Convert an ontology elements represented in HTML to an ontology element."""
         html_text = self._opts.html_text
         ontology = parse_html_to_ontology(html_text)
-        unstructured_elements = ontology_to_unstructured_elements(ontology)
+        unstructured_elements = ontology_to_unstructured_elements(
+            ontology, add_img_alt_text=self._opts.add_img_alt_text
+        )
         return unstructured_elements
diff --git a/unstructured/partition/html/transformations.py b/unstructured/partition/html/transformations.py
@@ -24,6 +24,7 @@ def ontology_to_unstructured_elements(
     page_number: int = None,
     depth: int = 0,
     filename: str | None = None,
+    add_img_alt_text: bool = True,
 ) -> list[elements.Element]:
     """
     Converts an OntologyElement object to a list of unstructured Element objects.
@@ -44,7 +45,9 @@ def ontology_to_unstructured_elements(
         parent_id (str, optional): The ID of the parent element. Defaults to None.
         page_number (int, optional): The page number of the element. Defaults to None.
         depth (int, optional): The depth of the element in the hierarchy. Defaults to 0.
-
+        filename (str, optional): The name of the file the element comes from. Defaults to None.
+        add_img_alt_text (bool): Whether to include the alternative text of images
+                                            in the output. Defaults to True.
     Returns:
         list[Element]: A list of unstructured Element objects.
     """
@@ -77,6 +80,7 @@ def ontology_to_unstructured_elements(
                 page_number=page_number,
                 depth=0 if isinstance(ontology_element, ontology.Document) else depth + 1,
                 filename=filename,
+                add_img_alt_text=add_img_alt_text,
             )
             children += child
 
@@ -85,7 +89,7 @@ def ontology_to_unstructured_elements(
     else:
         element_class = ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE[ontology_element.__class__]
         html_code_of_ontology_element = ontology_element.to_html()
-        element_text = ontology_element.to_text()
+        element_text = ontology_element.to_text(add_img_alt_text=add_img_alt_text)
 
         unstructured_element = element_class(
             text=element_text,
@@ -278,7 +282,6 @@ def parse_html_to_ontology(html_code: str) -> ontology.OntologyElement:
     Args:
         html_code (str): The HTML code to be parsed.
             Parsing HTML will start from <div class="Page">.
-
     Returns:
         OntologyElement: The parsed Element object.
 
@@ -352,7 +355,6 @@ def parse_html_to_ontology_element(
     Args:
         soup (Tag): The BeautifulSoup Tag object to be converted.
         recursion_depth (int): Flag to control limit of recursion depth.
-
     Returns:
         OntologyElement: The converted OntologyElement object.
     """
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		__version__ = "0.16.6" # pragma: no cover
		__version__ = "0.16.7" # pragma: no cover