import base64
import json
import os
import tempfile

import requests
from fastapi import Depends, FastAPI, Header, HTTPException, Request, status
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import ORJSONResponse

from .pipeline import Pipeline
from .types import UnstructuredInput, UnstructuredOutput

# Fastapi App


def handle_http_exception(req: Request, exc: HTTPException) -> ORJSONResponse:
    """Render an ``HTTPException`` as a JSON body with its code and detail.

    Only the body carries the error code; no ``status_code`` is passed to the
    response object, so the HTTP status stays at the response class default.
    """
    return ORJSONResponse(
        content={"status_code": exc.status_code, "status_message": exc.detail}
    )


_EXCEPTION_HANDLERS = {HTTPException: handle_http_exception}


def create_app():
    """Build the FastAPI application: JSON defaults, CORS and a health route."""
    application = FastAPI(
        default_response_class=ORJSONResponse,
        exception_handlers=_EXCEPTION_HANDLERS,
    )

    # Every origin, method and header is accepted, credentials included.
    cors_options = {
        "allow_origins": ["*"],
        "allow_credentials": True,
        "allow_methods": ["*"],
        "allow_headers": ["*"],
    }
    application.add_middleware(CORSMiddleware, **cors_options)

    @application.get("/health")
    def get_health():
        return {"status": "OK"}

    return application


app = create_app()
config_file = "./config/config.json"
pipeline = Pipeline(config_file)


@app.post("/v1/etl4llm/predict", response_model=UnstructuredOutput)
async def etl4_llm(inp: UnstructuredInput):
    """Stage the submitted document in a temporary directory and run the pipeline.

    The document bytes are taken from the first entry of ``inp.b64_data`` when
    it is given, otherwise they are downloaded from ``inp.url`` (optionally
    using ``headers`` and ``ssl_verify`` from ``inp.parameters``).
    ``inp.file_path`` and ``inp.file_type`` are filled in before
    ``pipeline.predict`` is called, and its result is returned unchanged.

    Raises:
        HTTPException: status 400 when the filename has no usable extension,
            when neither ``b64_data`` nor ``url`` is given, when the base64
            payload cannot be decoded, or when the download fails.
    """
    # The name comes from the client and is joined onto the temporary directory
    # below; keep only its last path component so "../x" or an absolute path
    # cannot make the handler write outside that directory.
    filename = os.path.basename(inp.filename)
    _, dot, extension = filename.rpartition(".")
    if not dot or not extension:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"filename[{inp.filename}] must have a file extension",
        )
    file_type = extension.lower()

    if not inp.b64_data and not inp.url:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="url or b64_data at least one must be given",
        )

    # `parameters` is declared Optional; normalise an explicit null so both this
    # handler and the pipeline can treat it as a mapping.
    if inp.parameters is None:
        inp.parameters = {}

    if inp.b64_data:
        try:
            payload = base64.b64decode(inp.b64_data[0])
        except ValueError as err:  # binascii.Error is a ValueError subclass
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=f"b64_data is not valid base64: {err}",
            ) from err
    else:
        # NOTE(review): `headers` / `ssl_verify` stay inside inp.parameters, and
        # Pipeline.predict forwards every parameter to the partition function as
        # a keyword argument — confirm the partition functions tolerate them.
        headers = inp.parameters.get("headers", {})
        ssl_verify = inp.parameters.get("ssl_verify", True)
        # NOTE(review): requests.get blocks the event loop inside this async
        # handler and is called without a timeout — consider a timeout and/or a
        # plain `def` endpoint.
        try:
            response = requests.get(inp.url, headers=headers, verify=ssl_verify)
        except requests.RequestException as err:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=f"URL could not be fetched: {err}",
            ) from err
        if not response.ok:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=f"URL return an error: {response.status_code}",
            )
        # The target file is opened in binary mode, so write the raw bytes;
        # `response.text` is a decoded str and cannot be written to it.
        payload = response.content

    with tempfile.TemporaryDirectory() as tmpdir:
        file_path = os.path.join(tmpdir, filename)
        with open(file_path, "wb") as fout:
            fout.write(payload)

        inp.file_path = file_path
        inp.file_type = file_type

        # Must run inside the `with`: the staged file is deleted on exit.
        return pipeline.predict(inp)
import json
from typing import Dict

from bisheng_unstructured.documents.html_utils import save_to_txt, visualize_html
from bisheng_unstructured.documents.pdf_parser.image import ImageDocument
from bisheng_unstructured.documents.pdf_parser.pdf import PDFDocument
from bisheng_unstructured.partition.doc import partition_doc
from bisheng_unstructured.partition.docx import partition_docx
from bisheng_unstructured.partition.html import partition_html
from bisheng_unstructured.partition.md import partition_md
from bisheng_unstructured.partition.ppt import partition_ppt
from bisheng_unstructured.partition.pptx import partition_pptx
from bisheng_unstructured.partition.xlsx import partition_xlsx
from bisheng_unstructured.staging.base import convert_to_isd

from .types import UnstructuredInput, UnstructuredOutput


def partition_pdf(filename, model_params, part_params=None, **kwargs):
    """Build a ``PDFDocument`` for ``filename`` and return its elements.

    ``part_params`` (optional mapping) and ``kwargs`` are both forwarded to the
    ``PDFDocument`` constructor as keyword arguments.
    """
    doc = PDFDocument(
        file=filename, model_params=model_params, **(part_params or {}), **kwargs
    )
    # NOTE(review): the bare attribute access presumably triggers the actual
    # parsing so that `doc.elements` is populated — confirm in PDFDocument.
    doc.pages
    return doc.elements


def partition_image(filename, model_params, part_params=None, **kwargs):
    """Build an ``ImageDocument`` for ``filename`` and return its elements.

    ``part_params`` (optional mapping) and ``kwargs`` are both forwarded to the
    ``ImageDocument`` constructor as keyword arguments.
    """
    doc = ImageDocument(
        file=filename, model_params=model_params, **(part_params or {}), **kwargs
    )
    # NOTE(review): see partition_pdf — presumably forces parsing; confirm.
    doc.pages
    return doc.elements


# Lower-case file extension -> function that turns the file into elements.
PARTITION_MAP = {
    "pdf": partition_pdf,
    "png": partition_image,
    "jpeg": partition_image,
    "jpg": partition_image,
    "tiff": partition_image,
    "doc": partition_doc,
    "docx": partition_docx,
    "ppt": partition_ppt,
    "pptx": partition_pptx,
    "xlsx": partition_xlsx,
    "md": partition_md,
    "html": partition_html,
}

# Output modes understood by Pipeline.predict.
_SUPPORTED_MODES = ("partition", "text", "vis")


class Pipeline(object):
    """Dispatch a staged file to its partition function and format the result."""

    def __init__(self, config_file: str):
        """Load the JSON configuration and keep its ``pdf_model_params`` entry."""
        with open(config_file, encoding="utf-8") as fin:
            self.config = json.load(fin)
        self.pdf_model_params = self.config.get("pdf_model_params")

    def predict(self, inp: UnstructuredInput) -> UnstructuredOutput:
        """Partition ``inp.file_path`` and render the elements per ``inp.mode``.

        ``"partition"`` fills ``partitions``, ``"text"`` fills ``text`` and
        ``"vis"`` fills ``html_text``. Every failure, including an unsupported
        file type or mode, is reported through ``status_code=400`` and
        ``status_message`` on the returned object rather than by raising.
        """
        if inp.file_type not in PARTITION_MAP:
            return UnstructuredOutput(
                status_code=400,
                status_message=f"file type[{inp.file_type}] not supported",
            )
        # Reject an unknown mode before doing any partitioning work; otherwise
        # the failure only shows up afterwards as an unbound-variable error.
        if inp.mode not in _SUPPORTED_MODES:
            return UnstructuredOutput(
                status_code=400,
                status_message=f"mode[{inp.mode}] not supported",
            )

        # `filename` is written after the request parameters so a "filename"
        # key supplied by the caller cannot redirect parsing to another path.
        part_inp = {**(inp.parameters or {}), "filename": inp.file_path}
        part_func = PARTITION_MAP[inp.file_type]
        if part_func in (partition_pdf, partition_image):
            part_inp["model_params"] = self.pdf_model_params

        try:
            elements = part_func(**part_inp)
            if inp.mode == "partition":
                return UnstructuredOutput(partitions=convert_to_isd(elements))
            if inp.mode == "text":
                return UnstructuredOutput(text=save_to_txt(elements))
            return UnstructuredOutput(html_text=visualize_html(elements))
        except Exception as e:
            # API boundary: any parsing failure becomes an error payload.
            return UnstructuredOutput(status_code=400, status_message=str(e))
b/src/bisheng_unstructured/api/types.py new file mode 100644 index 0000000..cb5a183 --- /dev/null +++ b/src/bisheng_unstructured/api/types.py @@ -0,0 +1,21 @@ +from typing import Any, Dict, List, Optional, Union + +from pydantic import BaseModel + + +class UnstructuredInput(BaseModel): + filename: str + url: Optional[str] = None + b64_data: Optional[List[str]] = None + parameters: Optional[Dict] = {} + mode: str = "text" # text, partition, vis + file_path: Optional[str] = None + file_type: Optional[str] = None + + +class UnstructuredOutput(BaseModel): + status_code: int = 200 + status_message: str = "success" + text: Optional[str] = None + html_text: Optional[str] = None + partitions: List[Dict[str, Any]] = [] diff --git a/src/bisheng_unstructured/documents/html_utils.py b/src/bisheng_unstructured/documents/html_utils.py index 935ec48..ae8f4b7 100644 --- a/src/bisheng_unstructured/documents/html_utils.py +++ b/src/bisheng_unstructured/documents/html_utils.py @@ -1,7 +1,7 @@ from bisheng_unstructured.documents.markdown import transform_html_table_to_md -def visualize_html(elements, output_file): +def visualize_html(elements, output_file=None): html_prefix = """ @@ -50,11 +50,15 @@ def visualize_html(elements, output_file): body_content = "\n".join(texts) html_str = html_prefix + body_content + html_suffix - with open(output_file, "w") as fout: - fout.write(html_str) + if output_file: + with open(output_file, "w") as fout: + fout.write(html_str) + else: + return html_str -def save_to_txt(elements, output_file): + +def save_to_txt(elements, output_file=None): text_elem_sep = "\n" content_page = [] is_first_elem = True @@ -77,5 +81,8 @@ def save_to_txt(elements, output_file): last_label = label - with open(output_file, "w") as fout: - fout.write("".join(content_page)) + if output_file: + with open(output_file, "w") as fout: + fout.write("".join(content_page)) + else: + return "".join(content_page) diff --git a/tests/test_client.py b/tests/test_client.py new file mode 
import base64
import os

import requests

# Prediction endpoint of the running service that every client test targets.
PREDICT_URL = "http://192.168.106.12:10001/v1/etl4llm/predict"


def _predict(filename, mode):
    """POST ``filename`` as base64 to the service and check the reply.

    The decoded JSON reply is printed, then its ``status_code`` field is
    asserted to be 200 so that a service-side failure fails the test.
    """
    # Read through a context manager so the file handle is closed promptly.
    with open(filename, "rb") as fin:
        b64_data = base64.b64encode(fin.read()).decode()
    inp = dict(filename=os.path.basename(filename), b64_data=[b64_data], mode=mode)
    resp = requests.post(PREDICT_URL, json=inp).json()
    print(resp)
    assert resp.get("status_code") == 200, resp


def test1():
    """JPEG sample, "text" mode."""
    _predict("examples/docs/maoxuan_sample1.jpg", "text")


def test2():
    """PPTX sample, "text" mode."""
    _predict("./examples/docs/毛泽东课件.pptx", "text")


def test3():
    """PPTX sample, "partition" mode."""
    _predict("./examples/docs/毛泽东课件.pptx", "partition")


def test4():
    """PPTX sample, "vis" mode."""
    _predict("./examples/docs/毛泽东课件.pptx", "vis")


if __name__ == "__main__":
    # Guarded so that importing the module (e.g. pytest collection) does not
    # fire a network request; swap in test1/test2/test3 to try other cases.
    test4()
"http://192.168.106.12:9001/v2.1/models/" def test_image(): - url = TEST_RT_URL - layout_ep = url + 'elem_layout_v1/infer' - cell_model_ep = url + 'elem_table_cell_detect_v1/infer' - rowcol_model_ep = url + 'elem_table_rowcol_detect_v1/infer' - table_model_ep = url + 'elem_table_detect_v1/infer' - - model_params = { - 'layout_ep': layout_ep, - 'cell_model_ep': cell_model_ep, - 'rowcol_model_ep': rowcol_model_ep, - 'table_model_ep': table_model_ep, - } - - filename = "examples/docs/maoxuan_intro_with_table.jpg" - doc = ImageDocument( - file=filename, - model_params=model_params) - pages = doc.pages - elements = doc.elements - - visualize_html(elements, 'data/maoxuan_intro_with_table.html') - save_to_txt(elements, 'data/maoxuan_intro_with_table.txt') + url = TEST_RT_URL + layout_ep = url + "elem_layout_v1/infer" + cell_model_ep = url + "elem_table_cell_detect_v1/infer" + rowcol_model_ep = url + "elem_table_rowcol_detect_v1/infer" + table_model_ep = url + "elem_table_detect_v1/infer" + + model_params = { + "layout_ep": layout_ep, + "cell_model_ep": cell_model_ep, + "rowcol_model_ep": rowcol_model_ep, + "table_model_ep": table_model_ep, + } + + filename = "examples/docs/maoxuan_intro_with_table.jpg" + doc = ImageDocument(file=filename, model_params=model_params) + pages = doc.pages + elements = doc.elements + + visualize_html(elements, "data/maoxuan_intro_with_table.html") + save_to_txt(elements, "data/maoxuan_intro_with_table.txt") def test_image2(): - url = TEST_RT_URL - layout_ep = url + 'elem_layout_v1/infer' - cell_model_ep = url + 'elem_table_cell_detect_v1/infer' - rowcol_model_ep = url + 'elem_table_rowcol_detect_v1/infer' - table_model_ep = url + 'elem_table_detect_v1/infer' - - model_params = { - 'layout_ep': layout_ep, - 'cell_model_ep': cell_model_ep, - 'rowcol_model_ep': rowcol_model_ep, - 'table_model_ep': table_model_ep, - } - - filename = "examples/docs/maoxuan_sample1.jpg" - doc = ImageDocument( - file=filename, - model_params=model_params) - pages = 
from bisheng_unstructured.api.pipeline import Pipeline
from bisheng_unstructured.api.types import UnstructuredInput, UnstructuredOutput

CONFIG = "./config/config.json"


def _predict_text(file_path, file_type, **fields):
    """Run the pipeline in "text" mode on one file and check the result.

    Extra keyword arguments are passed to ``UnstructuredInput``. The output is
    printed, then ``status_code`` is asserted to be 200, because the pipeline
    reports failures through that field instead of raising.
    """
    inp = UnstructuredInput(
        filename="", file_path=file_path, file_type=file_type, mode="text", **fields
    )
    outp = Pipeline(CONFIG).predict(inp)
    print(outp.dict())
    assert outp.status_code == 200, outp.status_message


def test1():
    """Image input."""
    _predict_text("examples/docs/maoxuan_intro_with_table.jpg", "jpg")


def test2():
    """Scanned PDF input with an extra partition parameter."""
    _predict_text("examples/docs/maoxuan_scan.pdf", "pdf", parameters={"n": 20})


def test3():
    """PPTX input."""
    _predict_text("./examples/docs/毛泽东课件.pptx", "pptx")


if __name__ == "__main__":
    # Guarded so that importing the module (e.g. pytest collection) does not
    # run the pipeline; swap in test1/test2 to try other cases.
    test3()