From 4140f625d0a20dc09c9f4ce8ac72ad85b5e62446 Mon Sep 17 00:00:00 2001 From: Marianna Date: Thu, 5 Dec 2024 04:46:51 +0100 Subject: [PATCH] add script to render html from unstructured elements (#3799) Script to render HTML from unstructured elements. NOTE: This script is not intended to be used as a module. NOTE: This script is only intended to be used with outputs with non-empty `metadata.text_as_html`. TODO: It was noted that unstructured_elements_to_ontology func always returns a single page This script is using helper functions to handle multiple pages. I am not sure if this was intended, or it is a bug - if it is a bug it would require bit longer debugging - to make it usable fast I used workarounds. Usage: test with any outputs with non-empty `metadata.text_as_html`. Example files attached. `[Example-Bill-of-Lading-Waste.docx.pdf.json](https://github.com/user-attachments/files/17922898/Example-Bill-of-Lading-Waste.docx.pdf.json)` [Breast_Cancer1-5.pdf.json](https://github.com/user-attachments/files/17922899/Breast_Cancer1-5.pdf.json) --- scripts/html/rendered_html_from_elements.py | 146 ++++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 scripts/html/rendered_html_from_elements.py diff --git a/scripts/html/rendered_html_from_elements.py b/scripts/html/rendered_html_from_elements.py new file mode 100644 index 0000000000..5789a83d14 --- /dev/null +++ b/scripts/html/rendered_html_from_elements.py @@ -0,0 +1,146 @@ +# pyright: reportPrivateUsage=false + +""" +Script to render HTML from unstructured elements. +NOTE: This script is not intended to be used as a module. +NOTE: For now script is only intended to be used with elements generated with + `partition_html(html_parser_version=v2)` +TODO: It was noted that unstructured_elements_to_ontology func always returns a single page + This script is using helper functions to handle multiple pages. +""" + +import argparse +import logging +import os +import select +import sys +from collections import defaultdict +from typing import List, Sequence + +from bs4 import BeautifulSoup + +from unstructured.documents import elements +from unstructured.partition.html.transformations import unstructured_elements_to_ontology +from unstructured.staging.base import elements_from_json + +# Configure logging +logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") +logger = logging.getLogger(__name__) + + +def extract_document_div(html_content: str) -> str: + pos = html_content.find(">") + if pos != -1: + return html_content[: pos + 1] + logger.error("No '>' found in the HTML content.") + raise ValueError("No '>' found in the HTML content.") + + +def extract_page_div(html_content: str) -> str: + soup = BeautifulSoup(html_content, "html.parser") + page_divs = soup.find_all("div", class_="Page") + if len(page_divs) != 1: + logger.error( + "Expected exactly one
element with class 'Page'. Found %d.", len(page_divs) + ) + raise ValueError("Expected exactly one
element with class 'Page'.") + return str(page_divs[0]) + + +def fold_document_div( + html_document_start: str, html_document_end: str, html_per_page: List[str] +) -> str: + html_document = html_document_start + for page_html in html_per_page: + html_document += page_html + html_document += html_document_end + return html_document + + +def group_elements_by_page( + unstructured_elements: Sequence[elements.Element], +) -> Sequence[Sequence[elements.Element]]: + pages_dict = defaultdict(list) + + for element in unstructured_elements: + page_number = element.metadata.page_number + pages_dict[page_number].append(element) + + pages_list = list(pages_dict.values()) + return pages_list + + +def rendered_html(*, filepath: str | None = None, text: str | None = None) -> str: + """Renders HTML from a JSON file with unstructured elements. + + Args: + filepath (str): path to JSON file with unstructured elements. + + Returns: + str: HTML content. + """ + if filepath is None and text is None: + logger.error("Either filepath or text must be provided.") + raise ValueError("Either filepath or text must be provided.") + if filepath is not None and text is not None: + logger.error("Both filepath and text cannot be provided.") + raise ValueError("Both filepath and text cannot be provided.") + if filepath is not None: + logger.info("Rendering HTML from file: %s", filepath) + else: + logger.info("Rendering HTML from text.") + + unstructured_elements = elements_from_json(filename=filepath, text=text) + unstructured_elements_per_page = group_elements_by_page(unstructured_elements) + # parsed_ontology = unstructured_elements_to_ontology(unstructured_elements) + parsed_ontology_per_page = [ + unstructured_elements_to_ontology(elements) for elements in unstructured_elements_per_page + ] + html_per_page = [parsed_ontology.to_html() for parsed_ontology in parsed_ontology_per_page] + + html_document_start = extract_document_div(html_per_page[0]) + html_document_end = "
" + html_per_page = [extract_page_div(page) for page in html_per_page] + + return fold_document_div(html_document_start, html_document_end, html_per_page) + + +def _main(): + if os.getenv("PROCESS_FROM_STDIN") == "true": + logger.info("Processing from STDIN (PROCESS_FROM_STDIN is set to 'true')") + if select.select([sys.stdin], [], [], 0.1)[0]: + content = sys.stdin.read() + html = rendered_html(text=content) + sys.stdout.write(html) + else: + logger.error("No input provided via STDIN. Exiting.") + sys.exit(1) + else: + logger.info("Processing from command line arguments") + parser = argparse.ArgumentParser(description="Render HTML from unstructured elements.") + parser.add_argument( + "filepath", help="Path to JSON file with unstructured elements.", type=str + ) + parser.add_argument( + "--outdir", + help="Path to directory where the rendered html will be stored.", + type=str, + default=None, + nargs="?", + ) + args = parser.parse_args() + + html = rendered_html(filepath=args.filepath) + if args.outdir is None: + args.outdir = os.path.dirname(args.filepath) + os.makedirs(args.outdir, exist_ok=True) + outpath = os.path.join( + args.outdir, os.path.basename(args.filepath).replace(".json", ".html") + ) + with open(outpath, "w") as f: + f.write(html) + logger.info("HTML rendered and saved to: %s", outpath) + + +if __name__ == "__main__": + _main()