Skip to content

Commit

Permalink
support api
Browse files Browse the repository at this point in the history
  • Loading branch information
hrfng committed Sep 17, 2023
1 parent e680580 commit ecb07cc
Show file tree
Hide file tree
Showing 9 changed files with 335 additions and 56 deletions.
9 changes: 9 additions & 0 deletions config/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"pdf_model_params": {
"layout_ep": "http://192.168.106.12:9001/v2.1/models/elem_layout_v1/infer",
"cell_model_ep": "http://192.168.106.12:9001/v2.1/models/elem_table_cell_detect_v1/infer",
"rowcol_model_ep": "http://192.168.106.12:9001/v2.1/models/elem_table_rowcol_detect_v1/infer",
"table_model_ep": "http://192.168.106.12:9001/v2.1/models/elem_table_detect_v1/infer",
"ocr_model_ep": ""
}
}
Empty file.
84 changes: 84 additions & 0 deletions src/bisheng_unstructured/api/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import base64
import json
import os
import tempfile

import requests
from fastapi import Depends, FastAPI, Header, HTTPException, Request, status
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import ORJSONResponse

from .pipeline import Pipeline
from .types import UnstructuredInput, UnstructuredOutput

# FastAPI application setup


def handle_http_exception(req: Request, exc: HTTPException) -> ORJSONResponse:
    """Render an ``HTTPException`` as a JSON payload.

    The exception's ``status_code`` and ``detail`` are copied into the
    ``status_code`` / ``status_message`` fields of the body.  No explicit
    HTTP status is handed to ``ORJSONResponse``, so the response itself is
    sent with that class's default status.

    Args:
        req: The request being handled (unused).
        exc: The exception to report.
    """
    return ORJSONResponse(
        content={
            "status_code": exc.status_code,
            "status_message": exc.detail,
        }
    )


# Exception class -> handler callable; passed to FastAPI() as
# ``exception_handlers`` in create_app().
_EXCEPTION_HANDLERS = {HTTPException: handle_http_exception}


def create_app():
    """Build and return the FastAPI application.

    The app uses ``ORJSONResponse`` as its default response class, installs
    the module's exception handlers, exposes a ``GET /health`` probe and
    adds CORS middleware that allows every origin, method and header (with
    credentials enabled).
    """
    application = FastAPI(
        default_response_class=ORJSONResponse,
        exception_handlers=_EXCEPTION_HANDLERS,
    )

    @application.get("/health")
    def get_health():
        return {"status": "OK"}

    cors_options = {
        "allow_origins": ["*"],
        "allow_credentials": True,
        "allow_methods": ["*"],
        "allow_headers": ["*"],
    }
    application.add_middleware(CORSMiddleware, **cors_options)

    return application


# Module-level application object; the predict route below is registered on it.
app = create_app()

# Relative path, so it is resolved against the process's working directory.
config_file = "./config/config.json"
# Built once at import time; the predict route uses this single instance.
pipeline = Pipeline(config_file)


@app.post("/v1/etl4llm/predict", response_model=UnstructuredOutput)
async def etl4_llm(inp: UnstructuredInput):
    """Materialize the submitted document on disk and run the pipeline on it.

    The document bytes come from ``inp.b64_data[0]`` (base64) when present,
    otherwise they are downloaded from ``inp.url``.  The bytes are written
    into a temporary directory, ``inp.file_path`` / ``inp.file_type`` are
    filled in, and the request is handed to ``pipeline.predict``.

    Args:
        inp: The request body.  ``inp.parameters`` may carry ``headers`` and
            ``ssl_verify`` entries that are used for the URL download.

    Returns:
        Whatever ``pipeline.predict`` returns for the prepared input.

    Raises:
        HTTPException: With status 400 when neither ``b64_data`` nor ``url``
            is given, when the file name has no extension, or when the URL
            download does not succeed.
    """
    if not inp.b64_data and not inp.url:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="url or b64_data at least one must be given",
        )

    # Keep only the last path component so a crafted name such as
    # "../../x.pdf" cannot escape the temporary directory.
    filename = os.path.basename(inp.filename)
    _, dot, suffix = filename.rpartition(".")
    if not dot or not suffix:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"filename[{inp.filename}] has no file extension",
        )
    file_type = suffix.lower()

    if inp.b64_data:
        payload = base64.b64decode(inp.b64_data[0])
    else:
        # ``parameters`` is Optional, so guard against an explicit null.
        params = inp.parameters or {}
        # NOTE(review): this is a blocking call inside an async handler and
        # has no timeout; confirm whether that is acceptable for this service.
        response = requests.get(
            inp.url,
            headers=params.get("headers", {}),
            verify=params.get("ssl_verify", True),
        )
        if not response.ok:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=f"URL return an error: {response.status_code}",
            )
        # ``content`` is the raw body as bytes; ``text`` is a decoded str and
        # cannot be written to a file opened in binary mode.
        payload = response.content

    with tempfile.TemporaryDirectory() as tmpdir:
        file_path = os.path.join(tmpdir, filename)
        with open(file_path, "wb") as fout:
            fout.write(payload)

        inp.file_path = file_path
        inp.file_type = file_type

        # Must run while the temporary directory (and the file) still exist.
        return pipeline.predict(inp)
79 changes: 79 additions & 0 deletions src/bisheng_unstructured/api/pipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import json
from typing import Dict

from bisheng_unstructured.documents.html_utils import save_to_txt, visualize_html
from bisheng_unstructured.documents.pdf_parser.image import ImageDocument
from bisheng_unstructured.documents.pdf_parser.pdf import PDFDocument
from bisheng_unstructured.partition.doc import partition_doc
from bisheng_unstructured.partition.docx import partition_docx
from bisheng_unstructured.partition.html import partition_html
from bisheng_unstructured.partition.md import partition_md
from bisheng_unstructured.partition.ppt import partition_ppt
from bisheng_unstructured.partition.pptx import partition_pptx
from bisheng_unstructured.partition.xlsx import partition_xlsx
from bisheng_unstructured.staging.base import convert_to_isd

from .types import UnstructuredInput, UnstructuredOutput


def partition_pdf(filename, model_params, part_params=None, **kwargs):
    """Build a ``PDFDocument`` for *filename* and return its elements.

    Args:
        filename: Path of the PDF; forwarded to ``PDFDocument`` as ``file``.
        model_params: Forwarded to ``PDFDocument`` as ``model_params``.
        part_params: Optional mapping of extra keyword arguments for
            ``PDFDocument``; ``None`` (the default) means no extras.
        **kwargs: Further keyword arguments forwarded unchanged.

    Returns:
        The ``elements`` attribute of the constructed document.
    """
    # ``None`` sentinel instead of a shared mutable ``{}`` default.
    extra = {} if part_params is None else part_params
    doc = PDFDocument(file=filename, model_params=model_params, **extra, **kwargs)
    # Read only for its effect: presumably accessing ``pages`` triggers the
    # parsing that populates ``elements`` -- TODO confirm.
    doc.pages
    return doc.elements


def partition_image(filename, model_params, part_params=None, **kwargs):
    """Build an ``ImageDocument`` for *filename* and return its elements.

    Args:
        filename: Path of the image; forwarded to ``ImageDocument`` as ``file``.
        model_params: Forwarded to ``ImageDocument`` as ``model_params``.
        part_params: Optional mapping of extra keyword arguments for
            ``ImageDocument``; ``None`` (the default) means no extras.
        **kwargs: Further keyword arguments forwarded unchanged.

    Returns:
        The ``elements`` attribute of the constructed document.
    """
    # ``None`` sentinel instead of a shared mutable ``{}`` default.
    extra = {} if part_params is None else part_params
    doc = ImageDocument(file=filename, model_params=model_params, **extra, **kwargs)
    # Read only for its effect: presumably accessing ``pages`` triggers the
    # parsing that populates ``elements`` -- TODO confirm.
    doc.pages
    return doc.elements


# File type (extension without the dot) -> partition function.
# Pipeline.predict looks the function up here and rejects any file type
# that is not a key of this table.
PARTITION_MAP = {
    "pdf": partition_pdf,
    "png": partition_image,
    "jpeg": partition_image,
    "jpg": partition_image,
    "tiff": partition_image,
    "doc": partition_doc,
    "docx": partition_docx,
    "ppt": partition_ppt,
    "pptx": partition_pptx,
    "xlsx": partition_xlsx,
    "md": partition_md,
    "html": partition_html,
}


class Pipeline(object):
    """Dispatch a document to the partition function for its file type."""

    def __init__(self, config_file: str):
        """Load the JSON configuration.

        Args:
            config_file: Path to a JSON file.  Its optional
                ``pdf_model_params`` entry is kept and later passed to the
                PDF and image partition functions as ``model_params``.
        """
        # Context manager so the config file handle is closed promptly.
        with open(config_file, encoding="utf-8") as fin:
            self.config = json.load(fin)
        self.pdf_model_params = self.config.get("pdf_model_params")

    def predict(self, inp: UnstructuredInput) -> UnstructuredOutput:
        """Partition ``inp.file_path`` and render the result per ``inp.mode``.

        Args:
            inp: Request whose ``file_path`` and ``file_type`` are already
                set.  ``inp.parameters`` is forwarded to the partition
                function as keyword arguments.

        Returns:
            An ``UnstructuredOutput`` with ``partitions`` (mode
            ``"partition"``), ``text`` (mode ``"text"``) or ``html_text``
            (mode ``"vis"``) filled in.  An unsupported mode, or any error
            raised while partitioning or rendering, yields an output with
            ``status_code=400`` and the reason in ``status_message``.

        Raises:
            Exception: If ``inp.file_type`` is not a key of ``PARTITION_MAP``.
        """
        file_type = inp.file_type
        if file_type not in PARTITION_MAP:
            raise Exception(f"file type[{inp.file_type}] not supported")

        # Reject an unknown mode up front: otherwise the (possibly expensive)
        # partition would run first and the failure would surface only as an
        # unbound-variable error message.
        mode = inp.mode
        if mode not in ("partition", "text", "vis"):
            return UnstructuredOutput(
                status_code=400,
                status_message=f"mode[{mode}] not supported",
            )

        part_func = PARTITION_MAP[file_type]
        # ``parameters`` is Optional, so guard against an explicit null.
        part_inp = {"filename": inp.file_path, **(inp.parameters or {})}
        if part_func in (partition_pdf, partition_image):
            part_inp["model_params"] = self.pdf_model_params

        try:
            elements = part_func(**part_inp)
            if mode == "partition":
                return UnstructuredOutput(partitions=convert_to_isd(elements))
            if mode == "text":
                return UnstructuredOutput(text=save_to_txt(elements))
            return UnstructuredOutput(html_text=visualize_html(elements))
        except Exception as e:
            # Boundary handler: failures are reported in-band to the client.
            return UnstructuredOutput(status_code=400, status_message=str(e))
21 changes: 21 additions & 0 deletions src/bisheng_unstructured/api/types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from typing import Any, Dict, List, Optional, Union

from pydantic import BaseModel


class UnstructuredInput(BaseModel):
    """Request body of the ``/v1/etl4llm/predict`` endpoint."""

    # Name of the submitted file; the API handler derives the file type from
    # the text after its last ".".
    filename: str
    # Location the API handler downloads the document from when no b64_data
    # is given.
    url: Optional[str] = None
    # Base64-encoded document content; the API handler decodes only item 0.
    b64_data: Optional[List[str]] = None
    # Free-form options: "headers" / "ssl_verify" are read for the URL
    # download, and the whole dict is forwarded to the partition function as
    # keyword arguments.
    parameters: Optional[Dict] = {}
    mode: str = "text"  # text, partition, vis
    # The two fields below are overwritten by the API handler before the
    # pipeline runs.
    file_path: Optional[str] = None
    file_type: Optional[str] = None


class UnstructuredOutput(BaseModel):
    """Response body of the ``/v1/etl4llm/predict`` endpoint."""

    # Defaults describe success; Pipeline.predict sets 400 plus the error
    # text when partitioning fails.
    status_code: int = 200
    status_message: str = "success"
    # Filled in for mode "text".
    text: Optional[str] = None
    # Filled in for mode "vis".
    html_text: Optional[str] = None
    # Filled in for mode "partition".
    partitions: List[Dict[str, Any]] = []
19 changes: 13 additions & 6 deletions src/bisheng_unstructured/documents/html_utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from bisheng_unstructured.documents.markdown import transform_html_table_to_md


def visualize_html(elements, output_file):
def visualize_html(elements, output_file=None):
html_prefix = """
<html>
<head>
Expand Down Expand Up @@ -50,11 +50,15 @@ def visualize_html(elements, output_file):

body_content = "\n".join(texts)
html_str = html_prefix + body_content + html_suffix
with open(output_file, "w") as fout:
fout.write(html_str)

if output_file:
with open(output_file, "w") as fout:
fout.write(html_str)
else:
return html_str

def save_to_txt(elements, output_file):

def save_to_txt(elements, output_file=None):
text_elem_sep = "\n"
content_page = []
is_first_elem = True
Expand All @@ -77,5 +81,8 @@ def save_to_txt(elements, output_file):

last_label = label

with open(output_file, "w") as fout:
fout.write("".join(content_page))
if output_file:
with open(output_file, "w") as fout:
fout.write("".join(content_page))
else:
return "".join(content_page)
46 changes: 46 additions & 0 deletions tests/test_client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import base64
import os

import requests


_PREDICT_URL = "http://192.168.106.12:10001/v1/etl4llm/predict"


def _post_file(filename, mode):
    """POST *filename* to the predict endpoint in *mode* and print the reply."""
    # Context manager so the sample file handle is closed after reading.
    with open(filename, "rb") as fin:
        b64_data = base64.b64encode(fin.read()).decode()
    inp = dict(filename=os.path.basename(filename), b64_data=[b64_data], mode=mode)
    resp = requests.post(_PREDICT_URL, json=inp).json()
    print(resp)


def test1():
    """Send the sample JPEG in "text" mode."""
    _post_file("examples/docs/maoxuan_sample1.jpg", "text")


def test2():
    """Send the sample PPTX in "text" mode."""
    _post_file("./examples/docs/毛泽东课件.pptx", "text")


def test3():
    """Send the sample PPTX in "partition" mode."""
    _post_file("./examples/docs/毛泽东课件.pptx", "partition")


def test4():
    """Send the sample PPTX in "vis" mode."""
    _post_file("./examples/docs/毛泽东课件.pptx", "vis")


# Manual driver: the uncommented call below runs when this module is
# executed or imported; swap the comment markers to run another scenario.
# test1()
# test2()
# test3()
test4()
96 changes: 46 additions & 50 deletions tests/test_image.py
Original file line number Diff line number Diff line change
@@ -1,58 +1,54 @@
from bisheng_unstructured.documents.html_utils import save_to_txt, visualize_html
from bisheng_unstructured.documents.pdf_parser.image import ImageDocument
from bisheng_unstructured.documents.html_utils import visualize_html, save_to_txt

TEST_RT_URL = 'http://192.168.106.12:9001/v2.1/models/'
TEST_RT_URL = "http://192.168.106.12:9001/v2.1/models/"


def test_image():
    """Parse the sample image containing a table and dump HTML and text output."""
    model_params = {
        "layout_ep": TEST_RT_URL + "elem_layout_v1/infer",
        "cell_model_ep": TEST_RT_URL + "elem_table_cell_detect_v1/infer",
        "rowcol_model_ep": TEST_RT_URL + "elem_table_rowcol_detect_v1/infer",
        "table_model_ep": TEST_RT_URL + "elem_table_detect_v1/infer",
    }

    filename = "examples/docs/maoxuan_intro_with_table.jpg"
    doc = ImageDocument(file=filename, model_params=model_params)
    # Read only for its effect: presumably accessing ``pages`` triggers the
    # parsing that populates ``elements`` -- TODO confirm.
    doc.pages
    elements = doc.elements

    visualize_html(elements, "data/maoxuan_intro_with_table.html")
    save_to_txt(elements, "data/maoxuan_intro_with_table.txt")


def test_image2():
    """Parse the plain sample image and dump HTML and text output."""
    model_params = {
        "layout_ep": TEST_RT_URL + "elem_layout_v1/infer",
        "cell_model_ep": TEST_RT_URL + "elem_table_cell_detect_v1/infer",
        "rowcol_model_ep": TEST_RT_URL + "elem_table_rowcol_detect_v1/infer",
        "table_model_ep": TEST_RT_URL + "elem_table_detect_v1/infer",
    }

    filename = "examples/docs/maoxuan_sample1.jpg"
    doc = ImageDocument(file=filename, model_params=model_params)
    # Read only for its effect: presumably accessing ``pages`` triggers the
    # parsing that populates ``elements`` -- TODO confirm.
    doc.pages
    elements = doc.elements

    visualize_html(elements, "data/maoxuan_sample1.html")
    save_to_txt(elements, "data/maoxuan_sample1.txt")


# test_image2()
test_image()
Loading

0 comments on commit ecb07cc

Please sign in to comment.