support api

dataelement · Sep 17, 2023 · ecb07cc · ecb07cc
1 parent e680580
commit ecb07cc
Show file tree

Hide file tree

Showing 9 changed files with 335 additions and 56 deletions.
diff --git a/config/config.json b/config/config.json
@@ -0,0 +1,9 @@
+{
+    "pdf_model_params": {
+        "layout_ep": "http://192.168.106.12:9001/v2.1/models/elem_layout_v1/infer",
+        "cell_model_ep": "http://192.168.106.12:9001/v2.1/models/elem_table_cell_detect_v1/infer",
+        "rowcol_model_ep": "http://192.168.106.12:9001/v2.1/models/elem_table_rowcol_detect_v1/infer",
+        "table_model_ep": "http://192.168.106.12:9001/v2.1/models/elem_table_detect_v1/infer",
+        "ocr_model_ep": ""
+    }
+}
diff --git a/src/bisheng_unstructured/api/__init__.py b/src/bisheng_unstructured/api/__init__.py
diff --git a/src/bisheng_unstructured/api/main.py b/src/bisheng_unstructured/api/main.py
@@ -0,0 +1,84 @@
+import base64
+import json
+import os
+import tempfile
+
+import requests
+from fastapi import Depends, FastAPI, Header, HTTPException, Request, status
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import ORJSONResponse
+
+from .pipeline import Pipeline
+from .types import UnstructuredInput, UnstructuredOutput
+
+# Fastapi App
+
+
+def handle_http_exception(req: Request, exc: HTTPException) -> ORJSONResponse:
+    """Render an ``HTTPException`` as a JSON payload.
+
+    The exception's status code and detail are copied into the body under
+    ``status_code`` and ``status_message``. ``req`` is accepted but not used.
+    """
+    return ORJSONResponse(
+        content={
+            "status_code": exc.status_code,
+            "status_message": exc.detail,
+        }
+    )
+
+
+_EXCEPTION_HANDLERS = {HTTPException: handle_http_exception}
+
+
+def create_app():
+    """Build the FastAPI application with a health route and CORS middleware."""
+    application = FastAPI(
+        default_response_class=ORJSONResponse,
+        exception_handlers=_EXCEPTION_HANDLERS,
+    )
+
+    @application.get("/health")
+    def get_health():
+        return {"status": "OK"}
+
+    # Any origin, method and header is accepted, and credentials are allowed.
+    application.add_middleware(
+        CORSMiddleware,
+        allow_origins=["*"],
+        allow_credentials=True,
+        allow_methods=["*"],
+        allow_headers=["*"],
+    )
+
+    return application
+
+
+# Module-level application object; the route decorator below attaches to it.
+app = create_app()
+
+# Relative path: it is opened as-is, so it resolves against the process
+# working directory.
+config_file = "./config/config.json"
+# Built once at import time and used by the predict endpoint for every request.
+pipeline = Pipeline(config_file)
+
+
+@app.post("/v1/etl4llm/predict", response_model=UnstructuredOutput)
+async def etl4_llm(inp: UnstructuredInput):
+    """Store the submitted document in a temp directory and run the pipeline.
+
+    The document is taken from ``inp.b64_data[0]`` when present, otherwise it
+    is downloaded from ``inp.url``. The file type is the lower-cased text after
+    the last ``.`` of the file name.
+
+    Raises:
+        HTTPException: status 400 when neither ``b64_data`` nor ``url`` is
+            given, the file name has no extension, the base64 payload is
+            invalid, or the download fails.
+    """
+    if not inp.b64_data and not inp.url:
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail="url or b64_data at least one must be given",
+        )
+
+    # The name is client-controlled: keep only its last path component so the
+    # file cannot be written outside the temporary directory.
+    filename = os.path.basename(inp.filename.replace("\\", "/"))
+    name_parts = filename.rsplit(".", 1)
+    if len(name_parts) != 2 or not name_parts[0] or not name_parts[1]:
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail="filename must be of the form <name>.<extension>",
+        )
+    file_type = name_parts[1].lower()
+
+    # ``parameters`` is declared Optional, so it may arrive as None.
+    params = inp.parameters or {}
+
+    # NOTE(review): requests.get and pipeline.predict are synchronous calls
+    # made from an async handler; consider a plain ``def`` endpoint or a
+    # thread pool if concurrent requests matter.
+    with tempfile.TemporaryDirectory() as tmpdir:
+        file_path = os.path.join(tmpdir, filename)
+        if inp.b64_data:
+            try:
+                payload = base64.b64decode(inp.b64_data[0])
+            except ValueError as exc:  # binascii.Error is a ValueError
+                raise HTTPException(
+                    status_code=status.HTTP_400_BAD_REQUEST,
+                    detail="b64_data is not valid base64",
+                ) from exc
+        else:
+            headers = params.get("headers", {})
+            ssl_verify = params.get("ssl_verify", True)
+            # NOTE(review): the URL is caller-supplied and fetched from the
+            # server side; restrict allowed hosts if this service is exposed
+            # to untrusted clients.
+            try:
+                response = requests.get(
+                    inp.url, headers=headers, verify=ssl_verify, timeout=60
+                )
+            except requests.RequestException as exc:
+                raise HTTPException(
+                    status_code=status.HTTP_400_BAD_REQUEST,
+                    detail=f"URL could not be fetched: {exc}",
+                ) from exc
+            if not response.ok:
+                raise HTTPException(
+                    status_code=status.HTTP_400_BAD_REQUEST,
+                    detail=f"URL return an error: {response.status_code}",
+                )
+            # Raw bytes, not ``response.text``: the file is opened in binary
+            # mode and the document must not be re-encoded.
+            payload = response.content
+
+        with open(file_path, "wb") as fout:
+            fout.write(payload)
+
+        inp.file_path = file_path
+        inp.file_type = file_type
+
+        # Must run inside the ``with`` block: the file disappears on exit.
+        return pipeline.predict(inp)
diff --git a/src/bisheng_unstructured/api/pipeline.py b/src/bisheng_unstructured/api/pipeline.py
@@ -0,0 +1,79 @@
+import json
+from typing import Dict
+
+from bisheng_unstructured.documents.html_utils import save_to_txt, visualize_html
+from bisheng_unstructured.documents.pdf_parser.image import ImageDocument
+from bisheng_unstructured.documents.pdf_parser.pdf import PDFDocument
+from bisheng_unstructured.partition.doc import partition_doc
+from bisheng_unstructured.partition.docx import partition_docx
+from bisheng_unstructured.partition.html import partition_html
+from bisheng_unstructured.partition.md import partition_md
+from bisheng_unstructured.partition.ppt import partition_ppt
+from bisheng_unstructured.partition.pptx import partition_pptx
+from bisheng_unstructured.partition.xlsx import partition_xlsx
+from bisheng_unstructured.staging.base import convert_to_isd
+
+from .types import UnstructuredInput, UnstructuredOutput
+
+
+def partition_pdf(filename, model_params, part_params=None, **kwargs):
+    """Build a ``PDFDocument`` for ``filename`` and return its elements.
+
+    Args:
+        filename: Path of the PDF file, passed as ``file``.
+        model_params: Passed through to ``PDFDocument`` as ``model_params``.
+        part_params: Optional mapping of extra keyword arguments for
+            ``PDFDocument``; ``None`` means no extras.
+        **kwargs: Further keyword arguments forwarded to ``PDFDocument``.
+    """
+    # Avoid a shared mutable default: start from a fresh dict on every call.
+    part_params = {} if part_params is None else part_params
+    doc = PDFDocument(file=filename, model_params=model_params, **part_params, **kwargs)
+    # Presumably accessing ``pages`` triggers the parsing that fills
+    # ``elements`` — confirm against PDFDocument.
+    doc.pages
+    return doc.elements
+
+
+def partition_image(filename, model_params, part_params=None, **kwargs):
+    """Build an ``ImageDocument`` for ``filename`` and return its elements.
+
+    Args:
+        filename: Path of the image file, passed as ``file``.
+        model_params: Passed through to ``ImageDocument`` as ``model_params``.
+        part_params: Optional mapping of extra keyword arguments for
+            ``ImageDocument``; ``None`` means no extras.
+        **kwargs: Further keyword arguments forwarded to ``ImageDocument``.
+    """
+    # Avoid a shared mutable default: start from a fresh dict on every call.
+    part_params = {} if part_params is None else part_params
+    doc = ImageDocument(file=filename, model_params=model_params, **part_params, **kwargs)
+    # Presumably accessing ``pages`` triggers the parsing that fills
+    # ``elements`` — confirm against ImageDocument.
+    doc.pages
+    return doc.elements
+
+
+# Maps a file type string to the function that partitions files of that type.
+# Pipeline.predict looks up ``inp.file_type`` here and rejects types that are
+# not listed. Keys are lower-case.
+PARTITION_MAP = {
+    "pdf": partition_pdf,
+    "png": partition_image,
+    "jpeg": partition_image,
+    "jpg": partition_image,
+    "tiff": partition_image,
+    "doc": partition_doc,
+    "docx": partition_docx,
+    "ppt": partition_ppt,
+    "pptx": partition_pptx,
+    "xlsx": partition_xlsx,
+    "md": partition_md,
+    "html": partition_html,
+}
+
+
+class Pipeline(object):
+    """Run the partition function for a document and format the result."""
+
+    # Output modes understood by ``predict``.
+    _MODES = ("partition", "text", "vis")
+
+    def __init__(self, config_file: str):
+        """Load the JSON config and keep its ``pdf_model_params`` entry.
+
+        Args:
+            config_file: Path of a JSON file; ``pdf_model_params`` is read
+                from its top level (``None`` when absent).
+        """
+        # Context manager so the handle is closed instead of leaked.
+        with open(config_file, encoding="utf-8") as fin:
+            self.config = json.load(fin)
+        self.pdf_model_params = self.config.get("pdf_model_params")
+
+    def predict(self, inp: UnstructuredInput) -> UnstructuredOutput:
+        """Partition ``inp.file_path`` and return the output for ``inp.mode``.
+
+        ``inp.parameters`` is forwarded to the partition function as keyword
+        arguments; PDF and image partitioners additionally receive
+        ``model_params`` from the config.
+
+        Returns:
+            An ``UnstructuredOutput`` with ``partitions``, ``text`` or
+            ``html_text`` filled according to the mode, or one with
+            ``status_code=400`` when the mode is unknown or partitioning fails.
+
+        Raises:
+            Exception: if ``inp.file_type`` is not a key of ``PARTITION_MAP``.
+        """
+        if inp.file_type not in PARTITION_MAP:
+            raise Exception(f"file type[{inp.file_type}] not supported")
+
+        # Reject an unknown mode up front, before any partitioning work, so
+        # the caller gets a clear message rather than an unbound-variable error.
+        mode = inp.mode
+        if mode not in self._MODES:
+            return UnstructuredOutput(
+                status_code=400, status_message=f"mode[{mode}] not supported"
+            )
+
+        part_func = PARTITION_MAP[inp.file_type]
+        # ``parameters`` is Optional on the input model, so tolerate None.
+        part_inp = {"filename": inp.file_path, **(inp.parameters or {})}
+        if part_func in (partition_pdf, partition_image):
+            part_inp["model_params"] = self.pdf_model_params
+
+        try:
+            elements = part_func(**part_inp)
+            if mode == "partition":
+                return UnstructuredOutput(partitions=convert_to_isd(elements))
+            if mode == "text":
+                return UnstructuredOutput(text=save_to_txt(elements))
+            return UnstructuredOutput(html_text=visualize_html(elements))
+        except Exception as e:
+            # Failures are reported in the response body rather than raised.
+            return UnstructuredOutput(status_code=400, status_message=str(e))
diff --git a/src/bisheng_unstructured/api/types.py b/src/bisheng_unstructured/api/types.py
@@ -0,0 +1,21 @@
+from typing import Any, Dict, List, Optional, Union
+
+from pydantic import BaseModel
+
+
+class UnstructuredInput(BaseModel):
+    """Request body of the predict endpoint."""
+
+    # Name of the document; the endpoint derives the file type from the text
+    # after its last "." and uses it as the temp file name.
+    filename: str
+    # Location to download the document from when ``b64_data`` is not given.
+    url: Optional[str] = None
+    # Base64-encoded document; the endpoint decodes only the first item.
+    b64_data: Optional[List[str]] = None
+    # Free-form options. The endpoint reads "headers" and "ssl_verify" for the
+    # download, and the pipeline forwards the whole dict as keyword arguments
+    # to the partition function.
+    parameters: Optional[Dict] = {}
+    mode: str = "text"  # text, partition, vis
+    # Filled in by the endpoint before the pipeline runs.
+    file_path: Optional[str] = None
+    file_type: Optional[str] = None
+
+
+class UnstructuredOutput(BaseModel):
+    """Response body of the predict endpoint."""
+
+    # The pipeline sets 400 and the error text here when partitioning fails.
+    status_code: int = 200
+    status_message: str = "success"
+    # Set by the pipeline in "text" mode.
+    text: Optional[str] = None
+    # Set by the pipeline in "vis" mode.
+    html_text: Optional[str] = None
+    # Set by the pipeline in "partition" mode.
+    partitions: List[Dict[str, Any]] = []
diff --git a/src/bisheng_unstructured/documents/html_utils.py b/src/bisheng_unstructured/documents/html_utils.py
@@ -1,7 +1,7 @@
 from bisheng_unstructured.documents.markdown import transform_html_table_to_md
 
 
-def visualize_html(elements, output_file):
+def visualize_html(elements, output_file=None):
     html_prefix = """
     <html>
     <head>
@@ -50,11 +50,15 @@ def visualize_html(elements, output_file):
 
     body_content = "\n".join(texts)
     html_str = html_prefix + body_content + html_suffix
-    with open(output_file, "w") as fout:
-        fout.write(html_str)
 
+    if output_file:
+        with open(output_file, "w") as fout:
+            fout.write(html_str)
+    else:
+        return html_str
 
-def save_to_txt(elements, output_file):
+
+def save_to_txt(elements, output_file=None):
     text_elem_sep = "\n"
     content_page = []
     is_first_elem = True
@@ -77,5 +81,8 @@ def save_to_txt(elements, output_file):
 
         last_label = label
 
-    with open(output_file, "w") as fout:
-        fout.write("".join(content_page))
+    if output_file:
+        with open(output_file, "w") as fout:
+            fout.write("".join(content_page))
+    else:
+        return "".join(content_page)
diff --git a/tests/test_client.py b/tests/test_client.py
@@ -0,0 +1,46 @@
+import base64
+import os
+
+import requests
+
+
+def test1():
+    """Post a JPEG sample to the predict endpoint in ``text`` mode and print the reply."""
+    url = "http://192.168.106.12:10001/v1/etl4llm/predict"
+    filename = "examples/docs/maoxuan_sample1.jpg"
+    # Read through a context manager so the sample file handle is closed.
+    with open(filename, "rb") as fin:
+        b64_data = base64.b64encode(fin.read()).decode()
+    inp = dict(filename=os.path.basename(filename), b64_data=[b64_data], mode="text")
+    resp = requests.post(url, json=inp).json()
+    print(resp)
+
+
+def test2():
+    """Post a PPTX sample to the predict endpoint in ``text`` mode and print the reply."""
+    url = "http://192.168.106.12:10001/v1/etl4llm/predict"
+    filename = "./examples/docs/毛泽东课件.pptx"
+    # Read through a context manager so the sample file handle is closed.
+    with open(filename, "rb") as fin:
+        b64_data = base64.b64encode(fin.read()).decode()
+    inp = dict(filename=os.path.basename(filename), b64_data=[b64_data], mode="text")
+    resp = requests.post(url, json=inp).json()
+    print(resp)
+
+
+def test3():
+    """Post a PPTX sample to the predict endpoint in ``partition`` mode and print the reply."""
+    url = "http://192.168.106.12:10001/v1/etl4llm/predict"
+    filename = "./examples/docs/毛泽东课件.pptx"
+    # Read through a context manager so the sample file handle is closed.
+    with open(filename, "rb") as fin:
+        b64_data = base64.b64encode(fin.read()).decode()
+    inp = dict(filename=os.path.basename(filename), b64_data=[b64_data], mode="partition")
+    resp = requests.post(url, json=inp).json()
+    print(resp)
+
+
+def test4():
+    """Post a PPTX sample to the predict endpoint in ``vis`` mode and print the reply."""
+    url = "http://192.168.106.12:10001/v1/etl4llm/predict"
+    filename = "./examples/docs/毛泽东课件.pptx"
+    # Read through a context manager so the sample file handle is closed.
+    with open(filename, "rb") as fin:
+        b64_data = base64.b64encode(fin.read()).decode()
+    inp = dict(filename=os.path.basename(filename), b64_data=[b64_data], mode="vis")
+    resp = requests.post(url, json=inp).json()
+    print(resp)
+
+
+# Run a scenario only when executed as a script, so that merely importing
+# this module does not issue a network request.
+if __name__ == "__main__":
+    # test1()
+    # test2()
+    # test3()
+    test4()
diff --git a/tests/test_image.py b/tests/test_image.py
@@ -1,58 +1,54 @@
+from bisheng_unstructured.documents.html_utils import save_to_txt, visualize_html
 from bisheng_unstructured.documents.pdf_parser.image import ImageDocument
-from bisheng_unstructured.documents.html_utils import visualize_html, save_to_txt
 
-TEST_RT_URL = 'http://192.168.106.12:9001/v2.1/models/'
+TEST_RT_URL = "http://192.168.106.12:9001/v2.1/models/"
 
 
 def test_image():
-  url = TEST_RT_URL
-  layout_ep = url + 'elem_layout_v1/infer'
-  cell_model_ep = url + 'elem_table_cell_detect_v1/infer'
-  rowcol_model_ep = url + 'elem_table_rowcol_detect_v1/infer'
-  table_model_ep = url + 'elem_table_detect_v1/infer'
-
-  model_params = {
-    'layout_ep': layout_ep,
-    'cell_model_ep': cell_model_ep,
-    'rowcol_model_ep': rowcol_model_ep,
-    'table_model_ep': table_model_ep,
-  }
-
-  filename = "examples/docs/maoxuan_intro_with_table.jpg"
-  doc = ImageDocument(
-    file=filename, 
-    model_params=model_params)
-  pages = doc.pages
-  elements = doc.elements
-
-  visualize_html(elements, 'data/maoxuan_intro_with_table.html')
-  save_to_txt(elements, 'data/maoxuan_intro_with_table.txt')
+    url = TEST_RT_URL
+    layout_ep = url + "elem_layout_v1/infer"
+    cell_model_ep = url + "elem_table_cell_detect_v1/infer"
+    rowcol_model_ep = url + "elem_table_rowcol_detect_v1/infer"
+    table_model_ep = url + "elem_table_detect_v1/infer"
+
+    model_params = {
+        "layout_ep": layout_ep,
+        "cell_model_ep": cell_model_ep,
+        "rowcol_model_ep": rowcol_model_ep,
+        "table_model_ep": table_model_ep,
+    }
+
+    filename = "examples/docs/maoxuan_intro_with_table.jpg"
+    doc = ImageDocument(file=filename, model_params=model_params)
+    pages = doc.pages
+    elements = doc.elements
+
+    visualize_html(elements, "data/maoxuan_intro_with_table.html")
+    save_to_txt(elements, "data/maoxuan_intro_with_table.txt")
 
 
 def test_image2():
-  url = TEST_RT_URL
-  layout_ep = url + 'elem_layout_v1/infer'
-  cell_model_ep = url + 'elem_table_cell_detect_v1/infer'
-  rowcol_model_ep = url + 'elem_table_rowcol_detect_v1/infer'
-  table_model_ep = url + 'elem_table_detect_v1/infer'
-
-  model_params = {
-    'layout_ep': layout_ep,
-    'cell_model_ep': cell_model_ep,
-    'rowcol_model_ep': rowcol_model_ep,
-    'table_model_ep': table_model_ep,
-  }
-
-  filename = "examples/docs/maoxuan_sample1.jpg"
-  doc = ImageDocument(
-    file=filename, 
-    model_params=model_params)
-  pages = doc.pages
-  elements = doc.elements
-
-  visualize_html(elements, 'data/maoxuan_sample1.html')
-  save_to_txt(elements, 'data/maoxuan_sample1.txt')
-
-
-test_image2()
-test_image()
+    url = TEST_RT_URL
+    layout_ep = url + "elem_layout_v1/infer"
+    cell_model_ep = url + "elem_table_cell_detect_v1/infer"
+    rowcol_model_ep = url + "elem_table_rowcol_detect_v1/infer"
+    table_model_ep = url + "elem_table_detect_v1/infer"
+
+    model_params = {
+        "layout_ep": layout_ep,
+        "cell_model_ep": cell_model_ep,
+        "rowcol_model_ep": rowcol_model_ep,
+        "table_model_ep": table_model_ep,
+    }
+
+    filename = "examples/docs/maoxuan_sample1.jpg"
+    doc = ImageDocument(file=filename, model_params=model_params)
+    pages = doc.pages
+    elements = doc.elements
+
+    visualize_html(elements, "data/maoxuan_sample1.html")
+    save_to_txt(elements, "data/maoxuan_sample1.txt")
+
+
+# test_image2()
+test_image()