-
Notifications
You must be signed in to change notification settings - Fork 804
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
add script to render html from unstructured elements #3799
Changes from all commits
d98474c
115a390
cf48f51
aa329e4
ca9e8ef
9a4eca6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,146 @@ | ||
# pyright: reportPrivateUsage=false | ||
|
||
""" | ||
Script to render HTML from unstructured elements.

NOTE: This script is not intended to be used as a module.
NOTE: For now, this script is only intended to be used with elements generated with
`partition_html(html_parser_version=v2)`.
TODO: It was noted that the `unstructured_elements_to_ontology` function always returns a
single page. This script uses helper functions to handle multiple pages.

Example usage (replace `<script>` with the path to this file):

    python <script> path/to/elements.json --outdir path/to/output
    PROCESS_FROM_STDIN=true python <script> < path/to/elements.json > elements.html
""" | ||
|
||
import argparse | ||
import logging | ||
import os | ||
import select | ||
import sys | ||
from collections import defaultdict | ||
from typing import List, Sequence | ||
|
||
from bs4 import BeautifulSoup | ||
|
||
from unstructured.documents import elements | ||
from unstructured.partition.html.transformations import unstructured_elements_to_ontology | ||
from unstructured.staging.base import elements_from_json | ||
|
||
# Configure logging: timestamped, INFO-level records for everything this script reports.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
|
||
|
||
def extract_document_div(html_content: str) -> str:
    """Return the start of `html_content` up to and including its first '>'.

    For HTML that begins with the document's opening tag, the result is that opening tag.

    Raises:
        ValueError: if `html_content` contains no '>' character.
    """
    head, closer, _ = html_content.partition(">")
    if not closer:
        # partition() yields an empty separator when '>' does not occur at all.
        logger.error("No '>' found in the HTML content.")
        raise ValueError("No '>' found in the HTML content.")
    return head + closer
|
||
|
||
def extract_page_div(html_content: str) -> str:
    """Return the single `<div class="Page">` element of `html_content`, serialized as HTML.

    Raises:
        ValueError: if the content does not contain exactly one such element.
    """
    matches = BeautifulSoup(html_content, "html.parser").find_all("div", class_="Page")
    found = len(matches)
    if found == 1:
        return str(matches[0])
    logger.error("Expected exactly one <div> element with class 'Page'. Found %d.", found)
    raise ValueError("Expected exactly one <div> element with class 'Page'.")
|
||
|
||
def fold_document_div(
    html_document_start: str, html_document_end: str, html_per_page: List[str]
) -> str:
    """Wrap the per-page HTML fragments in the document's opening and closing markup.

    Args:
        html_document_start: markup placed before the first page.
        html_document_end: markup placed after the last page.
        html_per_page: HTML of each page, in output order. May be empty.

    Returns:
        str: the start markup, every page in order, then the end markup.
    """
    # A single join avoids re-copying the growing document on every `+=`.
    return "".join([html_document_start, *html_per_page, html_document_end])
|
||
|
||
def group_elements_by_page(
    unstructured_elements: Sequence[elements.Element],
) -> Sequence[Sequence[elements.Element]]:
    """Split elements into one list per `metadata.page_number`.

    Pages are returned in the order their page number is first seen, and elements keep
    their relative order within a page.
    """
    # A review hint pointed at `itertools.groupby` for this. Note that groupby only merges
    # consecutive items with equal keys, whereas this dict-based grouping also collects
    # elements of the same page that are not adjacent in the input.
    by_page = {}
    for element in unstructured_elements:
        by_page.setdefault(element.metadata.page_number, []).append(element)
    return list(by_page.values())
|
||
|
||
def rendered_html(*, filepath: str | None = None, text: str | None = None) -> str:
    """Render a single HTML document from JSON-serialized unstructured elements.

    Exactly one of `filepath` and `text` must be given.

    Args:
        filepath: path to a JSON file with unstructured elements.
        text: JSON string with unstructured elements (e.g. read from STDIN).

    Returns:
        str: HTML content with all pages folded into one document element.

    Raises:
        ValueError: if neither or both of `filepath` and `text` are provided, if the input
            holds no elements, or if a page's HTML cannot be split by the helper functions.
    """
    # NOTE(review): PR feedback on this validation suggested always taking a filename
    # (using a temporary file for STDIN input) so the branching could go away. The
    # two-argument interface is kept here so existing callers keep working.
    if filepath is None and text is None:
        logger.error("Either filepath or text must be provided.")
        raise ValueError("Either filepath or text must be provided.")
    if filepath is not None and text is not None:
        logger.error("Both filepath and text cannot be provided.")
        raise ValueError("Both filepath and text cannot be provided.")
    if filepath is not None:
        logger.info("Rendering HTML from file: %s", filepath)
    else:
        logger.info("Rendering HTML from text.")

    unstructured_elements = elements_from_json(filename=filepath, text=text)
    if not unstructured_elements:
        # Without this guard the first-page lookup below fails with a bare IndexError.
        logger.error("No elements found in the input.")
        raise ValueError("No elements found in the input.")

    # `unstructured_elements_to_ontology` was noted to always return a single page, so each
    # page is converted on its own and the per-page results are stitched together below.
    html_per_page = [
        unstructured_elements_to_ontology(page_elements).to_html()
        for page_elements in group_elements_by_page(unstructured_elements)
    ]

    # Reuse the leading tag of the first page's output as the merged document's opening tag.
    html_document_start = extract_document_div(html_per_page[0])
    # Presumably that leading tag is a <div>, hence the matching closing tag -- TODO confirm.
    html_document_end = "</div>"
    page_divs = [extract_page_div(page_html) for page_html in html_per_page]

    return fold_document_div(html_document_start, html_document_end, page_divs)
|
||
|
||
def _main():
    """Command-line entry point: render HTML from a JSON dump of unstructured elements.

    Two modes, selected by the PROCESS_FROM_STDIN environment variable:

    * PROCESS_FROM_STDIN=true: read the JSON from STDIN and write the HTML to STDOUT, e.g.
      ``PROCESS_FROM_STDIN=true python <script> < elements.json > elements.html``.
    * otherwise: read the JSON file given as the positional ``filepath`` argument and write
      ``<input name>.html`` into ``--outdir`` (default: the input file's directory), e.g.
      ``python <script> elements.json --outdir out``.

    Exits with status 1 when STDIN mode is selected but no input is available.
    """
    if os.getenv("PROCESS_FROM_STDIN") == "true":
        logger.info("Processing from STDIN (PROCESS_FROM_STDIN is set to 'true')")
        # Wait briefly for input so a run without piped data fails fast instead of blocking
        # forever on read().
        # NOTE(review): select() on sys.stdin is not supported on Windows, and a producer
        # that needs longer than 0.1s to start writing is treated as "no input".
        if not select.select([sys.stdin], [], [], 0.1)[0]:
            logger.error("No input provided via STDIN. Exiting.")
            sys.exit(1)
        content = sys.stdin.read()
        sys.stdout.write(rendered_html(text=content))
        return

    logger.info("Processing from command line arguments")
    parser = argparse.ArgumentParser(description="Render HTML from unstructured elements.")
    parser.add_argument("filepath", help="Path to JSON file with unstructured elements.", type=str)
    parser.add_argument(
        "--outdir",
        help="Path to directory where the rendered html will be stored.",
        type=str,
        default=None,
        nargs="?",
    )
    args = parser.parse_args()

    html = rendered_html(filepath=args.filepath)

    outdir = args.outdir if args.outdir is not None else os.path.dirname(args.filepath)
    # dirname() is "" for a bare filename and os.makedirs("") raises FileNotFoundError,
    # so fall back to the current directory.
    outdir = outdir or "."
    os.makedirs(outdir, exist_ok=True)

    # Swap only the final extension. A blanket str.replace(".json", ".html") rewrites every
    # occurrence and, when the name has no ".json", leaves it unchanged -- which made the
    # output path equal to the input path and overwrote the input file.
    stem, _ = os.path.splitext(os.path.basename(args.filepath))
    outpath = os.path.join(outdir, stem + ".html")

    # Explicit UTF-8 so non-ASCII content does not depend on the platform's locale encoding.
    with open(outpath, "w", encoding="utf-8") as f:
        f.write(html)
    logger.info("HTML rendered and saved to: %s", outpath)
|
||
|
||
# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    _main()
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It would be nice to add example usage in docstrings.
I personally tested it this way: