From d98474cbf0999c7def53a08b8bce8672d15fef06 Mon Sep 17 00:00:00 2001 From: Marianna Parzych Date: Tue, 26 Nov 2024 17:46:17 +0100 Subject: [PATCH 1/5] add script to render html from unstructured elements --- .../html/rendered_html_from_elements.py | 104 ++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 unstructured/partition/html/rendered_html_from_elements.py diff --git a/unstructured/partition/html/rendered_html_from_elements.py b/unstructured/partition/html/rendered_html_from_elements.py new file mode 100644 index 0000000000..02c40975b3 --- /dev/null +++ b/unstructured/partition/html/rendered_html_from_elements.py @@ -0,0 +1,104 @@ +# pyright: reportPrivateUsage=false + +""" +Script to render HTML from unstructured elements. +NOTE: This script is not intended to be used as a module. +NOTE: This script is only intended to be used with "platinum" strategy of the partitioning pipeline. +TODO: It was noted that unstructured_elements_to_ontology func always returns a single page +This script is using helper functions to handle multiple pages. +""" + +import argparse +import os +from collections import defaultdict +from typing import List, Sequence + +from bs4 import BeautifulSoup + +from unstructured.documents import elements +from unstructured.partition.html.transformations import unstructured_elements_to_ontology +from unstructured.staging.base import elements_from_json + + +def extract_document_div(html_content: str) -> str: + pos = html_content.find(">") + if pos != -1: + return html_content[: pos + 1] + raise ValueError("No '>' found in the HTML content.") + + +def extract_page_div(html_content: str) -> str: + soup = BeautifulSoup(html_content, "html.parser") + page_divs = soup.find_all("div", class_="Page") + assert len(page_divs) == 1, "Expected exactly one
element with class 'Page'." + return str(page_divs[0]) + + +def fold_document_div( + html_document_start: str, html_document_end: str, html_per_page: List[str] +) -> str: + html_document = html_document_start + for page_html in html_per_page: + html_document += page_html + html_document += html_document_end + return html_document + + +def group_elements_by_page( + unstructured_elements: Sequence[elements.Element], +) -> Sequence[Sequence[elements.Element]]: + pages_dict = defaultdict(list) + + for element in unstructured_elements: + page_number = element.metadata.page_number + pages_dict[page_number].append(element) + + pages_list = list(pages_dict.values()) + return pages_list + + +def rendered_html(filepath: str) -> str: + """Renders HTML from a JSON file with unstructured elements. + + Args: + filepath (str): path to JSON file with unstructured elements. + + Returns: + str: HTML content. + """ + unstructured_elements = elements_from_json(filepath) + unstructured_elements_per_page = group_elements_by_page(unstructured_elements) + # parsed_ontology = unstructured_elements_to_ontology(unstructured_elements) + parsed_ontology_per_page = [ + unstructured_elements_to_ontology(elements) for elements in unstructured_elements_per_page + ] + html_per_page = [parsed_ontology.to_html() for parsed_ontology in parsed_ontology_per_page] + + html_document_start = extract_document_div(html_per_page[0]) + html_document_end = "
" + html_per_page = [extract_page_div(page) for page in html_per_page] + + return fold_document_div(html_document_start, html_document_end, html_per_page) + + +def _main(): + parser = argparse.ArgumentParser(description="Render HTML from unstructured elements.") + parser.add_argument("filepath", help="Path to JSON file with unstructured elements.", type=str) + parser.add_argument( + "--outdir", + help="Path to directory where the rendered html will be stored.", + type=str, + default=None, + ) + args = parser.parse_args() + + html = rendered_html(args.filepath) + if args.outdir is None: + args.outdir = os.path.dirname(args.filepath) + outpath = os.path.join(args.outdir, os.path.basename(args.filepath).replace(".json", ".html")) + with open(outpath, "w") as f: + f.write(html) + + +if __name__ == "__main__": + _main() From cf48f51a1715444ced29b928c58fec9db33a0f4f Mon Sep 17 00:00:00 2001 From: Marianna Date: Wed, 27 Nov 2024 10:15:27 +0100 Subject: [PATCH 2/5] Update unstructured/partition/html/rendered_html_from_elements.py Co-authored-by: cragwolfe --- unstructured/partition/html/rendered_html_from_elements.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured/partition/html/rendered_html_from_elements.py b/unstructured/partition/html/rendered_html_from_elements.py index 02c40975b3..ffe2e699b2 100644 --- a/unstructured/partition/html/rendered_html_from_elements.py +++ b/unstructured/partition/html/rendered_html_from_elements.py @@ -3,7 +3,7 @@ """ Script to render HTML from unstructured elements. NOTE: This script is not intended to be used as a module. -NOTE: This script is only intended to be used with "platinum" strategy of the partitioning pipeline. +NOTE: For now script is only intended to be used with elements generated with `partition_html(html_parser_version=v2)` TODO: It was noted that unstructured_elements_to_ontology func always returns a single page This script is using helper functions to handle multiple pages. """ From aa329e46f8bbed560d726d22e0700b5d12cda297 Mon Sep 17 00:00:00 2001 From: Marianna Parzych Date: Wed, 27 Nov 2024 11:18:12 +0100 Subject: [PATCH 3/5] move script to scripts/html --- .../partition => scripts}/html/rendered_html_from_elements.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {unstructured/partition => scripts}/html/rendered_html_from_elements.py (100%) diff --git a/unstructured/partition/html/rendered_html_from_elements.py b/scripts/html/rendered_html_from_elements.py similarity index 100% rename from unstructured/partition/html/rendered_html_from_elements.py rename to scripts/html/rendered_html_from_elements.py From ca9e8ef5cb046a838e7bda29c9d1a0849356fad5 Mon Sep 17 00:00:00 2001 From: Marianna Parzych Date: Wed, 27 Nov 2024 11:58:08 +0100 Subject: [PATCH 4/5] fix line too long error --- scripts/html/rendered_html_from_elements.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/html/rendered_html_from_elements.py b/scripts/html/rendered_html_from_elements.py index ffe2e699b2..b492d35429 100644 --- a/scripts/html/rendered_html_from_elements.py +++ b/scripts/html/rendered_html_from_elements.py @@ -3,9 +3,10 @@ """ Script to render HTML from unstructured elements. NOTE: This script is not intended to be used as a module. -NOTE: For now script is only intended to be used with elements generated with `partition_html(html_parser_version=v2)` +NOTE: For now script is only intended to be used with elements generated with + `partition_html(html_parser_version=v2)` TODO: It was noted that unstructured_elements_to_ontology func always returns a single page -This script is using helper functions to handle multiple pages. + This script is using helper functions to handle multiple pages. """ import argparse From 9a4eca60997f4c0ede4b02218c8fc05d6c6aead3 Mon Sep 17 00:00:00 2001 From: Marianna Parzych Date: Wed, 27 Nov 2024 12:54:14 +0100 Subject: [PATCH 5/5] ad support for STDIN and STDOUT --- scripts/html/rendered_html_from_elements.py | 79 ++++++++++++++++----- 1 file changed, 60 insertions(+), 19 deletions(-) diff --git a/scripts/html/rendered_html_from_elements.py b/scripts/html/rendered_html_from_elements.py index b492d35429..5789a83d14 100644 --- a/scripts/html/rendered_html_from_elements.py +++ b/scripts/html/rendered_html_from_elements.py @@ -10,7 +10,10 @@ """ import argparse +import logging import os +import select +import sys from collections import defaultdict from typing import List, Sequence @@ -20,18 +23,27 @@ from unstructured.partition.html.transformations import unstructured_elements_to_ontology from unstructured.staging.base import elements_from_json +# Configure logging +logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") +logger = logging.getLogger(__name__) + def extract_document_div(html_content: str) -> str: pos = html_content.find(">") if pos != -1: return html_content[: pos + 1] + logger.error("No '>' found in the HTML content.") raise ValueError("No '>' found in the HTML content.") def extract_page_div(html_content: str) -> str: soup = BeautifulSoup(html_content, "html.parser") page_divs = soup.find_all("div", class_="Page") - assert len(page_divs) == 1, "Expected exactly one
element with class 'Page'." + if len(page_divs) != 1: + logger.error( + "Expected exactly one
element with class 'Page'. Found %d.", len(page_divs) + ) + raise ValueError("Expected exactly one
element with class 'Page'.") return str(page_divs[0]) @@ -58,7 +70,7 @@ def group_elements_by_page( return pages_list -def rendered_html(filepath: str) -> str: +def rendered_html(*, filepath: str | None = None, text: str | None = None) -> str: """Renders HTML from a JSON file with unstructured elements. Args: @@ -67,7 +79,18 @@ def rendered_html(filepath: str) -> str: Returns: str: HTML content. """ - unstructured_elements = elements_from_json(filepath) + if filepath is None and text is None: + logger.error("Either filepath or text must be provided.") + raise ValueError("Either filepath or text must be provided.") + if filepath is not None and text is not None: + logger.error("Both filepath and text cannot be provided.") + raise ValueError("Both filepath and text cannot be provided.") + if filepath is not None: + logger.info("Rendering HTML from file: %s", filepath) + else: + logger.info("Rendering HTML from text.") + + unstructured_elements = elements_from_json(filename=filepath, text=text) unstructured_elements_per_page = group_elements_by_page(unstructured_elements) # parsed_ontology = unstructured_elements_to_ontology(unstructured_elements) parsed_ontology_per_page = [ @@ -83,22 +106,40 @@ def rendered_html(filepath: str) -> str: def _main(): - parser = argparse.ArgumentParser(description="Render HTML from unstructured elements.") - parser.add_argument("filepath", help="Path to JSON file with unstructured elements.", type=str) - parser.add_argument( - "--outdir", - help="Path to directory where the rendered html will be stored.", - type=str, - default=None, - ) - args = parser.parse_args() - - html = rendered_html(args.filepath) - if args.outdir is None: - args.outdir = os.path.dirname(args.filepath) - outpath = os.path.join(args.outdir, os.path.basename(args.filepath).replace(".json", ".html")) - with open(outpath, "w") as f: - f.write(html) + if os.getenv("PROCESS_FROM_STDIN") == "true": + logger.info("Processing from STDIN (PROCESS_FROM_STDIN is set to 'true')") + if select.select([sys.stdin], [], [], 0.1)[0]: + content = sys.stdin.read() + html = rendered_html(text=content) + sys.stdout.write(html) + else: + logger.error("No input provided via STDIN. Exiting.") + sys.exit(1) + else: + logger.info("Processing from command line arguments") + parser = argparse.ArgumentParser(description="Render HTML from unstructured elements.") + parser.add_argument( + "filepath", help="Path to JSON file with unstructured elements.", type=str + ) + parser.add_argument( + "--outdir", + help="Path to directory where the rendered html will be stored.", + type=str, + default=None, + nargs="?", + ) + args = parser.parse_args() + + html = rendered_html(filepath=args.filepath) + if args.outdir is None: + args.outdir = os.path.dirname(args.filepath) + os.makedirs(args.outdir, exist_ok=True) + outpath = os.path.join( + args.outdir, os.path.basename(args.filepath).replace(".json", ".html") + ) + with open(outpath, "w") as f: + f.write(html) + logger.info("HTML rendered and saved to: %s", outpath) if __name__ == "__main__":