diff --git a/scripts/html/rendered_html_from_elements.py b/scripts/html/rendered_html_from_elements.py new file mode 100644 index 0000000000..5789a83d14 --- /dev/null +++ b/scripts/html/rendered_html_from_elements.py @@ -0,0 +1,146 @@ +# pyright: reportPrivateUsage=false + +""" +Script to render HTML from unstructured elements. +NOTE: This script is not intended to be used as a module. +NOTE: For now script is only intended to be used with elements generated with + `partition_html(html_parser_version=v2)` +TODO: It was noted that unstructured_elements_to_ontology func always returns a single page + This script is using helper functions to handle multiple pages. +""" + +import argparse +import logging +import os +import select +import sys +from collections import defaultdict +from typing import List, Sequence + +from bs4 import BeautifulSoup + +from unstructured.documents import elements +from unstructured.partition.html.transformations import unstructured_elements_to_ontology +from unstructured.staging.base import elements_from_json + +# Configure logging +logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") +logger = logging.getLogger(__name__) + + +def extract_document_div(html_content: str) -> str: + pos = html_content.find(">") + if pos != -1: + return html_content[: pos + 1] + logger.error("No '>' found in the HTML content.") + raise ValueError("No '>' found in the HTML content.") + + +def extract_page_div(html_content: str) -> str: + soup = BeautifulSoup(html_content, "html.parser") + page_divs = soup.find_all("div", class_="Page") + if len(page_divs) != 1: + logger.error( + "Expected exactly one
element with class 'Page'. Found %d.", len(page_divs) + ) + raise ValueError("Expected exactly one
element with class 'Page'.") + return str(page_divs[0]) + + +def fold_document_div( + html_document_start: str, html_document_end: str, html_per_page: List[str] +) -> str: + html_document = html_document_start + for page_html in html_per_page: + html_document += page_html + html_document += html_document_end + return html_document + + +def group_elements_by_page( + unstructured_elements: Sequence[elements.Element], +) -> Sequence[Sequence[elements.Element]]: + pages_dict = defaultdict(list) + + for element in unstructured_elements: + page_number = element.metadata.page_number + pages_dict[page_number].append(element) + + pages_list = list(pages_dict.values()) + return pages_list + + +def rendered_html(*, filepath: str | None = None, text: str | None = None) -> str: + """Renders HTML from a JSON file with unstructured elements. + + Args: + filepath (str): path to JSON file with unstructured elements. + + Returns: + str: HTML content. + """ + if filepath is None and text is None: + logger.error("Either filepath or text must be provided.") + raise ValueError("Either filepath or text must be provided.") + if filepath is not None and text is not None: + logger.error("Both filepath and text cannot be provided.") + raise ValueError("Both filepath and text cannot be provided.") + if filepath is not None: + logger.info("Rendering HTML from file: %s", filepath) + else: + logger.info("Rendering HTML from text.") + + unstructured_elements = elements_from_json(filename=filepath, text=text) + unstructured_elements_per_page = group_elements_by_page(unstructured_elements) + # parsed_ontology = unstructured_elements_to_ontology(unstructured_elements) + parsed_ontology_per_page = [ + unstructured_elements_to_ontology(elements) for elements in unstructured_elements_per_page + ] + html_per_page = [parsed_ontology.to_html() for parsed_ontology in parsed_ontology_per_page] + + html_document_start = extract_document_div(html_per_page[0]) + html_document_end = "
" + html_per_page = [extract_page_div(page) for page in html_per_page] + + return fold_document_div(html_document_start, html_document_end, html_per_page) + + +def _main(): + if os.getenv("PROCESS_FROM_STDIN") == "true": + logger.info("Processing from STDIN (PROCESS_FROM_STDIN is set to 'true')") + if select.select([sys.stdin], [], [], 0.1)[0]: + content = sys.stdin.read() + html = rendered_html(text=content) + sys.stdout.write(html) + else: + logger.error("No input provided via STDIN. Exiting.") + sys.exit(1) + else: + logger.info("Processing from command line arguments") + parser = argparse.ArgumentParser(description="Render HTML from unstructured elements.") + parser.add_argument( + "filepath", help="Path to JSON file with unstructured elements.", type=str + ) + parser.add_argument( + "--outdir", + help="Path to directory where the rendered html will be stored.", + type=str, + default=None, + nargs="?", + ) + args = parser.parse_args() + + html = rendered_html(filepath=args.filepath) + if args.outdir is None: + args.outdir = os.path.dirname(args.filepath) + os.makedirs(args.outdir, exist_ok=True) + outpath = os.path.join( + args.outdir, os.path.basename(args.filepath).replace(".json", ".html") + ) + with open(outpath, "w") as f: + f.write(html) + logger.info("HTML rendered and saved to: %s", outpath) + + +if __name__ == "__main__": + _main()