Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add script to render html from unstructured elements #3799

Merged
merged 6 commits into from
Dec 5, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
146 changes: 146 additions & 0 deletions scripts/html/rendered_html_from_elements.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
# pyright: reportPrivateUsage=false
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be nice to add example usage to the docstrings.
I personally tested it this way:

python3 scripts/html/rendered_html_from_elements.py Breast_Cancer1-5.pdf.json --outdir . 
 cat Breast_Cancer1-5.pdf.json | PROCESS_FROM_STDIN=true python3 scripts/html/rendered_html_from_elements.py


"""
Script to render HTML from unstructured elements.
NOTE: This script is not intended to be used as a module.
NOTE: For now, this script is only intended to be used with elements generated with
`partition_html(html_parser_version=v2)`.
TODO: It was noted that the `unstructured_elements_to_ontology` function always returns a
single page. This script uses helper functions to handle multiple pages.
"""

import argparse
import logging
import os
import select
import sys
from collections import defaultdict
from typing import List, Sequence

from bs4 import BeautifulSoup

from unstructured.documents import elements
from unstructured.partition.html.transformations import unstructured_elements_to_ontology
from unstructured.staging.base import elements_from_json

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)


def extract_document_div(html_content: str) -> str:
    """Return the prefix of ``html_content`` up to and including its first ``>``.

    Args:
        html_content: HTML text to cut the leading tag from.

    Returns:
        str: everything from the start of ``html_content`` through the first ``>``.

    Raises:
        ValueError: if ``html_content`` contains no ``>`` character.
    """
    head, closer, _ = html_content.partition(">")
    if not closer:
        # partition() yields an empty separator when '>' is absent.
        logger.error("No '>' found in the HTML content.")
        raise ValueError("No '>' found in the HTML content.")
    return head + closer


def extract_page_div(html_content: str) -> str:
    """Return the single ``<div class="Page">`` element of ``html_content`` as a string.

    Args:
        html_content: HTML text parsed with BeautifulSoup's ``html.parser``.

    Returns:
        str: the serialized ``div`` element carrying the ``Page`` class.

    Raises:
        ValueError: if the number of matching ``div`` elements is not exactly one.
    """
    matches = BeautifulSoup(html_content, "html.parser").find_all("div", class_="Page")
    found = len(matches)
    if found == 1:
        return str(matches[0])
    logger.error("Expected exactly one <div> element with class 'Page'. Found %d.", found)
    raise ValueError("Expected exactly one <div> element with class 'Page'.")


def fold_document_div(
    html_document_start: str, html_document_end: str, html_per_page: List[str]
) -> str:
    """Concatenate the document opening, every page's HTML, and the document closing.

    Args:
        html_document_start: text placed before the pages.
        html_document_end: text placed after the pages.
        html_per_page: HTML of each page, in output order.

    Returns:
        str: ``html_document_start`` + all pages in order + ``html_document_end``.
    """
    # A single join avoids re-copying the growing string on every page.
    return "".join([html_document_start, *html_per_page, html_document_end])


def group_elements_by_page(
    unstructured_elements: Sequence[elements.Element],
) -> Sequence[Sequence[elements.Element]]:
    """Group elements into lists keyed by their ``metadata.page_number``.

    Groups are returned in the order each page number is first encountered in the input,
    and elements keep their input order within a group.

    Args:
        unstructured_elements: elements to group.

    Returns:
        A list with one list of elements per distinct page number.
    """
    buckets: dict = {}
    for item in unstructured_elements:
        buckets.setdefault(item.metadata.page_number, []).append(item)
    return list(buckets.values())


def rendered_html(*, filepath: str | None = None, text: str | None = None) -> str:
    """Render one HTML document from unstructured elements serialized as JSON.

    Exactly one of ``filepath`` and ``text`` must be provided.

    Args:
        filepath: path to a JSON file with unstructured elements.
        text: JSON string with unstructured elements.

    Returns:
        str: HTML content: the opening tag taken from the first page's HTML, followed by
        the ``Page`` div of every page, closed with ``</div>``.

    Raises:
        ValueError: if neither or both of ``filepath`` and ``text`` are provided, or if the
            input contains no elements.
    """
    # NOTE(review, @plutasnyy, 2024-11-28): the filepath/text branching below could be avoided
    # by always accepting text (STDIN already provides text; a file can be read into text by
    # the caller), or by always accepting a filename (STDIN would then need a temporary file).
    if filepath is None and text is None:
        logger.error("Either filepath or text must be provided.")
        raise ValueError("Either filepath or text must be provided.")
    if filepath is not None and text is not None:
        logger.error("Both filepath and text cannot be provided.")
        raise ValueError("Both filepath and text cannot be provided.")
    if filepath is not None:
        logger.info("Rendering HTML from file: %s", filepath)
    else:
        logger.info("Rendering HTML from text.")

    unstructured_elements = elements_from_json(filename=filepath, text=text)
    if not unstructured_elements:
        # Without this guard an empty input fails below with a bare IndexError.
        logger.error("No elements found in the input.")
        raise ValueError("No elements found in the input.")

    # Elements are converted one page at a time and the pages are stitched together afterwards.
    html_per_page = [
        unstructured_elements_to_ontology(page_elements).to_html()
        for page_elements in group_elements_by_page(unstructured_elements)
    ]

    # The first page's leading tag is reused as the opening tag of the whole document.
    html_document_start = extract_document_div(html_per_page[0])
    html_document_end = "</div>"
    page_divs = [extract_page_div(page) for page in html_per_page]

    return fold_document_div(html_document_start, html_document_end, page_divs)


def _main():
    """Command-line entry point.

    When the ``PROCESS_FROM_STDIN`` environment variable equals ``"true"``, the JSON is read
    from STDIN and the rendered HTML is written to STDOUT; the process exits with status 1 if
    STDIN is not readable within 0.1 seconds.

    Otherwise the positional ``filepath`` argument is rendered and saved as an ``.html`` file
    in ``--outdir`` (default: the directory of the input file).

    Example usage:
        python3 scripts/html/rendered_html_from_elements.py elements.json --outdir .
        cat elements.json | PROCESS_FROM_STDIN=true python3 \
            scripts/html/rendered_html_from_elements.py
    """
    if os.getenv("PROCESS_FROM_STDIN") == "true":
        logger.info("Processing from STDIN (PROCESS_FROM_STDIN is set to 'true')")
        # Wait briefly for STDIN to become readable instead of blocking forever on a terminal.
        if select.select([sys.stdin], [], [], 0.1)[0]:
            content = sys.stdin.read()
            html = rendered_html(text=content)
            sys.stdout.write(html)
        else:
            logger.error("No input provided via STDIN. Exiting.")
            sys.exit(1)
        return

    logger.info("Processing from command line arguments")
    parser = argparse.ArgumentParser(description="Render HTML from unstructured elements.")
    parser.add_argument(
        "filepath", help="Path to JSON file with unstructured elements.", type=str
    )
    parser.add_argument(
        "--outdir",
        help="Path to directory where the rendered html will be stored.",
        type=str,
        default=None,
        nargs="?",
    )
    args = parser.parse_args()

    html = rendered_html(filepath=args.filepath)

    outdir = args.outdir if args.outdir is not None else os.path.dirname(args.filepath)
    # dirname() is "" for a bare filename and os.makedirs("") raises FileNotFoundError,
    # so fall back to the current directory.
    outdir = outdir or os.curdir
    os.makedirs(outdir, exist_ok=True)

    # Swap only a trailing ".json" for ".html"; for any other name append ".html" so the
    # output can never have the same name as (and overwrite) the input file.
    outname = os.path.basename(args.filepath).removesuffix(".json") + ".html"
    outpath = os.path.join(outdir, outname)
    # Explicit encoding: the rendered HTML may contain non-ASCII text.
    with open(outpath, "w", encoding="utf-8") as f:
        f.write(html)
    logger.info("HTML rendered and saved to: %s", outpath)


if __name__ == "__main__":
_main()
Loading