diff --git a/CHANGELOG.md b/CHANGELOG.md index 26a3d559f2..8bcfb375d2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,12 @@ +## 0.16.7 + +### Enhancements +- **Add image_alt_mode to partition_html** Adds an `image_alt_mode` parameter to `partition_html()` to control how alt text is extracted from images in HTML documents. When set to `to_text` (the default), the alt text of each image is included in the text of the elements produced by the v2 HTML parser; when set to `None`, it is omitted. + +### Features + +### Fixes + ## 0.16.6 ### Enhancements diff --git a/test_unstructured/documents/html_files/example_with_alternative_text.html b/test_unstructured/documents/html_files/example_with_alternative_text.html new file mode 100644 index 0000000000..64cbf2f605 --- /dev/null +++ b/test_unstructured/documents/html_files/example_with_alternative_text.html @@ -0,0 +1,8 @@ +
+
+ + A line graph showing the comparison of 5 year cumulative total return for stocks +
+
+ diff --git a/test_unstructured/documents/test_ontology_to_unstructured_parsing.py b/test_unstructured/documents/test_ontology_to_unstructured_parsing.py index 72dec7d02a..f489be0363 100644 --- a/test_unstructured/documents/test_ontology_to_unstructured_parsing.py +++ b/test_unstructured/documents/test_ontology_to_unstructured_parsing.py @@ -181,6 +181,10 @@ def test_parsed_ontology_can_be_serialized_from_json(json_file_path): [ ("html_files/example.html", "unstructured_json_output/example.json"), ("html_files/example_full_doc.html", "unstructured_json_output/example_full_doc.json"), + ( + "html_files/example_with_alternative_text.html", + "unstructured_json_output/example_with_alternative_text.json", + ), ("html_files/three_tables.html", "unstructured_json_output/three_tables.json"), ( "html_files/example_with_inline_fields.html", @@ -191,13 +195,13 @@ def test_parsed_ontology_can_be_serialized_from_json(json_file_path): def test_parsed_ontology_can_be_serialized_from_html(html_file_path, json_file_path): html_file_path = Path(__file__).parent / html_file_path json_file_path = Path(__file__).parent / json_file_path - expected_json_elements = elements_from_json(str(json_file_path)) html_code = html_file_path.read_text() predicted_elements = partition_html( text=html_code, html_parser_version="v2", unique_element_ids=True ) + assert len(expected_json_elements) == len(predicted_elements) for i in range(len(expected_json_elements)): diff --git a/test_unstructured/documents/unstructured_json_output/example_with_alternative_text.json b/test_unstructured/documents/unstructured_json_output/example_with_alternative_text.json new file mode 100644 index 0000000000..f6c32707ea --- /dev/null +++ b/test_unstructured/documents/unstructured_json_output/example_with_alternative_text.json @@ -0,0 +1,62 @@ +[ + { + "element_id": "3a6b156a81764e17be128264241f8136", + "metadata": { + "category_depth": 0, + "filetype": "text/html", + "languages": [ + "eng" + ], + "page_number": 1, + 
"parent_id": "897a8a47377c4ad6aab839a929879537", + "text_as_html": "
" + }, + "text": "", + "type": "UncategorizedText" + }, + { + "element_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d", + "metadata": { + "category_depth": 1, + "filetype": "text/html", + "languages": [ + "eng" + ], + "page_number": 1, + "parent_id": "3a6b156a81764e17be128264241f8136", + "text_as_html": "
" + }, + "text": "", + "type": "UncategorizedText" + }, + { + "element_id": "33d66969-b274-4f88-abaa-e7f258b1595f", + "metadata": { + "category_depth": 2, + "filetype": "text/html", + "languages": [ + "eng" + ], + "page_number": 1, + "parent_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d", + "text_as_html": "\"New" + }, + "text": "New York logo", + "type": "Image" + }, + { + "element_id": "40c32fd8-9a02-42b8-a587-884293881090", + "metadata": { + "category_depth": 2, + "filetype": "text/html", + "languages": [ + "eng" + ], + "page_number": 1, + "parent_id": "6135aeb6-9558-46e2-9da4-473a74db3e9d", + "text_as_html": "\"A" + }, + "text": "A line graph showing the comparison of 5 year cumulative total return for stocks", + "type": "Image" + } +] \ No newline at end of file diff --git a/test_unstructured/partition/html/test_html_to_unstructured_and_back_parsing.py b/test_unstructured/partition/html/test_html_to_unstructured_and_back_parsing.py index a3edcfc024..f4bc09d54a 100644 --- a/test_unstructured/partition/html/test_html_to_unstructured_and_back_parsing.py +++ b/test_unstructured/partition/html/test_html_to_unstructured_and_back_parsing.py @@ -555,3 +555,21 @@ def test_inline_elements_are_squeezed_when_text_wrapped_into_paragraphs(): assert len(unstructured_elements) == 2 assert isinstance(unstructured_elements[0], Text) assert isinstance(unstructured_elements[1], NarrativeText) + + +def test_alternate_text_from_image_is_passed(): + # language=HTML + input_html = """ +
+ + + + + +
Example image nested in the table:ALT TEXT Logo
+
add_img_alt_text + """ + page = parse_html_to_ontology(input_html) + unstructured_elements = ontology_to_unstructured_elements(page) + assert len(unstructured_elements) == 2 + assert "ALT TEXT Logo" in unstructured_elements[1].text diff --git a/test_unstructured/partition/html/test_partition_v2.py b/test_unstructured/partition/html/test_partition_v2.py new file mode 100644 index 0000000000..4823646f49 --- /dev/null +++ b/test_unstructured/partition/html/test_partition_v2.py @@ -0,0 +1,47 @@ +from unstructured.partition.html import partition_html + + +def test_alternative_image_text_can_be_included(): + # language=HTML + html = """ +
+ ALT TEXT Logo +
+ """ + _, image_to_text_alt_mode = partition_html( + text=html, + image_alt_mode="to_text", + html_parser_version="v2", + ) + assert "ALT TEXT Logo" in image_to_text_alt_mode.text + + _, image_none_alt_mode = partition_html( + text=html, + image_alt_mode=None, + html_parser_version="v2", + ) + assert "ALT TEXT Logo" not in image_none_alt_mode.text + + +def test_alternative_image_text_can_be_included_when_nested_in_paragraph(): + # language=HTML + html = """ +
+

+ ALT TEXT Logo +

+
+ """ + _, paragraph_to_text_alt_mode = partition_html( + text=html, + image_alt_mode="to_text", + html_parser_version="v2", + ) + assert "ALT TEXT Logo" in paragraph_to_text_alt_mode.text + + _, paragraph_none_alt_mode = partition_html( + text=html, + image_alt_mode=None, + html_parser_version="v2", + ) + assert "ALT TEXT Logo" not in paragraph_none_alt_mode.text diff --git a/unstructured/__version__.py b/unstructured/__version__.py index e4054e64f3..8685b152b7 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.6" # pragma: no cover +__version__ = "0.16.7" # pragma: no cover diff --git a/unstructured/documents/ontology.py b/unstructured/documents/ontology.py index f2142b82bb..4ca550b8d7 100644 --- a/unstructured/documents/ontology.py +++ b/unstructured/documents/ontology.py @@ -89,11 +89,27 @@ def to_html(self, add_children=True) -> str: return result_html - def to_text(self, add_children=True) -> str: + def to_text(self, add_children=True, add_img_alt_text=True) -> str: + """ + Returns the text representation of the element. + + Args: + add_children: If True, the text of the children will be included. + Otherwise, element is represented as single self-closing tag. + add_img_alt_text: If True, the alt text of the image will be included. 
+ """ if self.children and add_children: - children_text = " ".join(child.to_text().strip() for child in self.children) + children_text = " ".join( + child.to_text(add_children, add_img_alt_text).strip() for child in self.children + ) return children_text - return BeautifulSoup(self.to_html(), "html.parser").get_text().strip() + + text = BeautifulSoup(self.to_html(), "html.parser").get_text().strip() + + if add_img_alt_text and self.html_tag_name == "img" and "alt" in self.additional_attributes: + text += f" {self.additional_attributes.get('alt', '')}" + + return text.strip() def _construct_attribute_string(self, attributes: dict) -> str: return " ".join( @@ -473,8 +489,8 @@ class FormFieldValue(OntologyElement): elementType: ElementTypeEnum = Field(ElementTypeEnum.form, frozen=True) allowed_tags: List[str] = Field(["input"], frozen=True) - def to_text(self, add_children=True) -> str: - text = super().to_text() + def to_text(self, add_children=True, add_img_alt_text=True) -> str: + text = super().to_text(add_children, add_img_alt_text) value = self.additional_attributes.get("value", "") if not value: return text diff --git a/unstructured/partition/html/partition.py b/unstructured/partition/html/partition.py index f49643bd84..865a727b23 100644 --- a/unstructured/partition/html/partition.py +++ b/unstructured/partition/html/partition.py @@ -36,6 +36,7 @@ def partition_html( skip_headers_and_footers: bool = False, detection_origin: Optional[str] = None, html_parser_version: Literal["v1", "v2"] = "v1", + image_alt_mode: Optional[Literal["to_text"]] = "to_text", **kwargs: Any, ) -> list[Element]: """Partitions an HTML document into its constituent elements. @@ -65,6 +66,9 @@ def partition_html( html_parser_version (Literal['v1', 'v2']): The version of the HTML parser to use. The default is 'v1'. For 'v2' the parser will use the ontology schema to parse the HTML document. 
+ + image_alt_mode (Optional[Literal['to_text']]): + When set to 'to_text' (the default), the v2 parser will include the alternative text of images in the output; when None, it is omitted. """ # -- parser rejects an empty str, nip that edge-case in the bud here -- if text is not None and text.strip() == "" and not file and not filename and not url: @@ -81,6 +85,7 @@ def partition_html( skip_headers_and_footers=skip_headers_and_footers, detection_origin=detection_origin, html_parser_version=html_parser_version, + image_alt_mode=image_alt_mode, ) return list(_HtmlPartitioner.iter_elements(opts)) @@ -102,6 +107,7 @@ def __init__( skip_headers_and_footers: bool, detection_origin: str | None, html_parser_version: Literal["v1", "v2"] = "v1", + image_alt_mode: Optional[Literal["to_text"]] = "to_text", ): self._file_path = file_path self._file = file @@ -113,6 +119,7 @@ def __init__( self._skip_headers_and_footers = skip_headers_and_footers self._detection_origin = detection_origin self._html_parser_version = html_parser_version + self._image_alt_mode = image_alt_mode @lazyproperty def detection_origin(self) -> str | None: @@ -172,6 +179,11 @@ def html_parser_version(self) -> Literal["v1", "v2"]: """When html_parser_version=='v2', HTML elements follow ontology schema.""" return self._html_parser_version + @lazyproperty + def add_img_alt_text(self) -> bool: + """When True, the alternative text of images is included in the output.""" + return self._image_alt_mode == "to_text" + class _HtmlPartitioner: """Partition HTML document into document-elements.""" @@ -239,5 +251,7 @@ def _from_ontology(self) -> List[Element]: """Convert an ontology elements represented in HTML to an ontology element.""" html_text = self._opts.html_text ontology = parse_html_to_ontology(html_text) - unstructured_elements = ontology_to_unstructured_elements(ontology) + unstructured_elements = ontology_to_unstructured_elements( + ontology, add_img_alt_text=self._opts.add_img_alt_text + ) return unstructured_elements diff --git 
a/unstructured/partition/html/transformations.py b/unstructured/partition/html/transformations.py index 0f12c967bb..d90e589f8f 100644 --- a/unstructured/partition/html/transformations.py +++ b/unstructured/partition/html/transformations.py @@ -24,6 +24,7 @@ def ontology_to_unstructured_elements( page_number: int = None, depth: int = 0, filename: str | None = None, + add_img_alt_text: bool = True, ) -> list[elements.Element]: """ Converts an OntologyElement object to a list of unstructured Element objects. @@ -44,7 +45,9 @@ def ontology_to_unstructured_elements( parent_id (str, optional): The ID of the parent element. Defaults to None. page_number (int, optional): The page number of the element. Defaults to None. depth (int, optional): The depth of the element in the hierarchy. Defaults to 0. - + filename (str, optional): The name of the file the element comes from. Defaults to None. + add_img_alt_text (bool): Whether to include the alternative text of images + in the output. Defaults to True. Returns: list[Element]: A list of unstructured Element objects. """ @@ -77,6 +80,7 @@ def ontology_to_unstructured_elements( page_number=page_number, depth=0 if isinstance(ontology_element, ontology.Document) else depth + 1, filename=filename, + add_img_alt_text=add_img_alt_text, ) children += child @@ -85,7 +89,7 @@ def ontology_to_unstructured_elements( else: element_class = ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE[ontology_element.__class__] html_code_of_ontology_element = ontology_element.to_html() - element_text = ontology_element.to_text() + element_text = ontology_element.to_text(add_img_alt_text=add_img_alt_text) unstructured_element = element_class( text=element_text, @@ -278,7 +282,6 @@ def parse_html_to_ontology(html_code: str) -> ontology.OntologyElement: Args: html_code (str): The HTML code to be parsed. Parsing HTML will start from
. - Returns: OntologyElement: The parsed Element object. @@ -352,7 +355,6 @@ def parse_html_to_ontology_element( Args: soup (Tag): The BeautifulSoup Tag object to be converted. recursion_depth (int): Flag to control limit of recursion depth. - Returns: OntologyElement: The converted OntologyElement object. """