Skip to content

Commit

Permalink
add script to render html from unstructured elements (#3799)
Browse files Browse the repository at this point in the history
Script to render HTML from unstructured elements.

NOTE: This script is not intended to be used as a module.
NOTE: This script is only intended to be used with outputs with
non-empty `metadata.text_as_html`.

TODO: It was noted that unstructured_elements_to_ontology func always
returns a single page
This script is using helper functions to handle multiple pages. I am not
sure if this was intended, or it is a bug - if it is a bug it would
require bit longer debugging - to make it usable fast I used
workarounds.

Usage: test with any outputs with non-empty `metadata.text_as_html`.
Example files attached.

`[Example-Bill-of-Lading-Waste.docx.pdf.json](https://github.com/user-attachments/files/17922898/Example-Bill-of-Lading-Waste.docx.pdf.json)`


[Breast_Cancer1-5.pdf.json](https://github.com/user-attachments/files/17922899/Breast_Cancer1-5.pdf.json)
  • Loading branch information
mariannaparzych authored Dec 5, 2024
1 parent 0fb814d commit 4140f62
Showing 1 changed file with 146 additions and 0 deletions.
146 changes: 146 additions & 0 deletions scripts/html/rendered_html_from_elements.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
# pyright: reportPrivateUsage=false

"""
Script to render HTML from unstructured elements.
NOTE: This script is not intended to be used as a module.
NOTE: For now script is only intended to be used with elements generated with
`partition_html(html_parser_version=v2)`
TODO: It was noted that unstructured_elements_to_ontology func always returns a single page
This script is using helper functions to handle multiple pages.
"""

import argparse
import logging
import os
import select
import sys
from collections import defaultdict
from typing import List, Sequence

from bs4 import BeautifulSoup

from unstructured.documents import elements
from unstructured.partition.html.transformations import unstructured_elements_to_ontology
from unstructured.staging.base import elements_from_json

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)


def extract_document_div(html_content: str) -> str:
pos = html_content.find(">")
if pos != -1:
return html_content[: pos + 1]
logger.error("No '>' found in the HTML content.")
raise ValueError("No '>' found in the HTML content.")


def extract_page_div(html_content: str) -> str:
soup = BeautifulSoup(html_content, "html.parser")
page_divs = soup.find_all("div", class_="Page")
if len(page_divs) != 1:
logger.error(
"Expected exactly one <div> element with class 'Page'. Found %d.", len(page_divs)
)
raise ValueError("Expected exactly one <div> element with class 'Page'.")
return str(page_divs[0])


def fold_document_div(
html_document_start: str, html_document_end: str, html_per_page: List[str]
) -> str:
html_document = html_document_start
for page_html in html_per_page:
html_document += page_html
html_document += html_document_end
return html_document


def group_elements_by_page(
unstructured_elements: Sequence[elements.Element],
) -> Sequence[Sequence[elements.Element]]:
pages_dict = defaultdict(list)

for element in unstructured_elements:
page_number = element.metadata.page_number
pages_dict[page_number].append(element)

pages_list = list(pages_dict.values())
return pages_list


def rendered_html(*, filepath: str | None = None, text: str | None = None) -> str:
"""Renders HTML from a JSON file with unstructured elements.
Args:
filepath (str): path to JSON file with unstructured elements.
Returns:
str: HTML content.
"""
if filepath is None and text is None:
logger.error("Either filepath or text must be provided.")
raise ValueError("Either filepath or text must be provided.")
if filepath is not None and text is not None:
logger.error("Both filepath and text cannot be provided.")
raise ValueError("Both filepath and text cannot be provided.")
if filepath is not None:
logger.info("Rendering HTML from file: %s", filepath)
else:
logger.info("Rendering HTML from text.")

unstructured_elements = elements_from_json(filename=filepath, text=text)
unstructured_elements_per_page = group_elements_by_page(unstructured_elements)
# parsed_ontology = unstructured_elements_to_ontology(unstructured_elements)
parsed_ontology_per_page = [
unstructured_elements_to_ontology(elements) for elements in unstructured_elements_per_page
]
html_per_page = [parsed_ontology.to_html() for parsed_ontology in parsed_ontology_per_page]

html_document_start = extract_document_div(html_per_page[0])
html_document_end = "</div>"
html_per_page = [extract_page_div(page) for page in html_per_page]

return fold_document_div(html_document_start, html_document_end, html_per_page)


def _main():
if os.getenv("PROCESS_FROM_STDIN") == "true":
logger.info("Processing from STDIN (PROCESS_FROM_STDIN is set to 'true')")
if select.select([sys.stdin], [], [], 0.1)[0]:
content = sys.stdin.read()
html = rendered_html(text=content)
sys.stdout.write(html)
else:
logger.error("No input provided via STDIN. Exiting.")
sys.exit(1)
else:
logger.info("Processing from command line arguments")
parser = argparse.ArgumentParser(description="Render HTML from unstructured elements.")
parser.add_argument(
"filepath", help="Path to JSON file with unstructured elements.", type=str
)
parser.add_argument(
"--outdir",
help="Path to directory where the rendered html will be stored.",
type=str,
default=None,
nargs="?",
)
args = parser.parse_args()

html = rendered_html(filepath=args.filepath)
if args.outdir is None:
args.outdir = os.path.dirname(args.filepath)
os.makedirs(args.outdir, exist_ok=True)
outpath = os.path.join(
args.outdir, os.path.basename(args.filepath).replace(".json", ".html")
)
with open(outpath, "w") as f:
f.write(html)
logger.info("HTML rendered and saved to: %s", outpath)


if __name__ == "__main__":
_main()

0 comments on commit 4140f62

Please sign in to comment.