-
Notifications
You must be signed in to change notification settings - Fork 16
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
9 changed files
with
335 additions
and
56 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
{ | ||
"pdf_model_params": { | ||
"layout_ep": "http://192.168.106.12:9001/v2.1/models/elem_layout_v1/infer", | ||
"cell_model_ep": "http://192.168.106.12:9001/v2.1/models/elem_table_cell_detect_v1/infer", | ||
"rowcol_model_ep": "http://192.168.106.12:9001/v2.1/models/elem_table_rowcol_detect_v1/infer", | ||
"table_model_ep": "http://192.168.106.12:9001/v2.1/models/elem_table_detect_v1/infer", | ||
"ocr_model_ep": "" | ||
} | ||
} |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
import base64 | ||
import json | ||
import os | ||
import tempfile | ||
|
||
import requests | ||
from fastapi import Depends, FastAPI, Header, HTTPException, Request, status | ||
from fastapi.middleware.cors import CORSMiddleware | ||
from fastapi.responses import ORJSONResponse | ||
|
||
from .pipeline import Pipeline | ||
from .types import UnstructuredInput, UnstructuredOutput | ||
|
||
# Fastapi App | ||
|
||
|
||
def handle_http_exception(req: Request, exc: HTTPException) -> ORJSONResponse:
    """Render an ``HTTPException`` as a JSON body with its code and detail."""
    return ORJSONResponse(
        content={
            "status_code": exc.status_code,
            "status_message": exc.detail,
        }
    )
|
||
|
||
_EXCEPTION_HANDLERS = {HTTPException: handle_http_exception} | ||
|
||
|
||
def create_app():
    """Build the FastAPI app with a ``/health`` route and CORS middleware."""
    application = FastAPI(
        exception_handlers=_EXCEPTION_HANDLERS,
        default_response_class=ORJSONResponse,
    )

    @application.get("/health")
    def get_health():
        return {"status": "OK"}

    # Wildcard CORS policy: any origin, method and header is accepted.
    application.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    return application
|
||
|
||
# Module-level application instance; the route decorator below attaches to it.
app = create_app()

# The pipeline is built once at import time. The config path is relative,
# so it is resolved against the process's working directory.
config_file = "./config/config.json"
pipeline = Pipeline(config_file)
|
||
|
||
@app.post("/v1/etl4llm/predict", response_model=UnstructuredOutput)
async def etl4_llm(inp: UnstructuredInput):
    """Store the request's document in a temporary directory and run the pipeline.

    The document is taken from the first entry of ``b64_data`` when present;
    otherwise it is downloaded from ``url`` using the optional ``headers`` and
    ``ssl_verify`` entries of ``parameters``. Invalid requests raise
    ``HTTPException`` with status 400.
    """
    # Validate before touching any other field so bad requests fail fast.
    if not inp.b64_data and not inp.url:
        raise HTTPException(
            status_code=400,
            detail="url or b64_data at least one must be given",
        )

    # Keep only the last path component: the name is client-supplied and must
    # not be able to point outside the temporary directory.
    filename = os.path.basename(inp.filename)
    _, _, ext = filename.rpartition(".")
    if not ext or ext == filename:
        # Covers "", "name", "name." and ".." -- none has a usable extension.
        raise HTTPException(
            status_code=400,
            detail=f"filename[{inp.filename}] has no extension",
        )
    file_type = ext.lower()

    # ``parameters`` is Optional on the model, so an explicit null is possible.
    parameters = inp.parameters or {}

    with tempfile.TemporaryDirectory() as tmpdir:
        file_path = os.path.join(tmpdir, filename)

        if inp.b64_data:
            try:
                payload = base64.b64decode(inp.b64_data[0])
            except ValueError as err:  # binascii.Error subclasses ValueError
                raise HTTPException(
                    status_code=400,
                    detail=f"b64_data is not valid base64: {err}",
                ) from err
        else:
            # NOTE(review): the URL is caller-controlled and this blocking call
            # has no timeout; consider an allow-list and a timeout.
            response = requests.get(
                inp.url,
                headers=parameters.get("headers", {}),
                verify=parameters.get("ssl_verify", True),
            )
            if not response.ok:
                raise HTTPException(
                    status_code=400,
                    detail=f"URL return an error: {response.status_code}",
                )
            # The file is opened in binary mode, so write the raw bytes;
            # ``response.text`` is a str and would raise TypeError.
            payload = response.content

        with open(file_path, "wb") as fout:
            fout.write(payload)

        inp.file_path = file_path
        inp.file_type = file_type

        # Must run while the temporary directory still exists.
        return pipeline.predict(inp)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
import json | ||
from typing import Dict | ||
|
||
from bisheng_unstructured.documents.html_utils import save_to_txt, visualize_html | ||
from bisheng_unstructured.documents.pdf_parser.image import ImageDocument | ||
from bisheng_unstructured.documents.pdf_parser.pdf import PDFDocument | ||
from bisheng_unstructured.partition.doc import partition_doc | ||
from bisheng_unstructured.partition.docx import partition_docx | ||
from bisheng_unstructured.partition.html import partition_html | ||
from bisheng_unstructured.partition.md import partition_md | ||
from bisheng_unstructured.partition.ppt import partition_ppt | ||
from bisheng_unstructured.partition.pptx import partition_pptx | ||
from bisheng_unstructured.partition.xlsx import partition_xlsx | ||
from bisheng_unstructured.staging.base import convert_to_isd | ||
|
||
from .types import UnstructuredInput, UnstructuredOutput | ||
|
||
|
||
def partition_pdf(filename, model_params, part_params=None, **kwargs):
    """Build a ``PDFDocument`` for ``filename`` and return its elements.

    Args:
        filename: Passed to ``PDFDocument`` as ``file``.
        model_params: Passed to ``PDFDocument`` as ``model_params``.
        part_params: Optional mapping of extra keyword arguments for
            ``PDFDocument``; ``None`` (the default) means no extras.
        **kwargs: Further keyword arguments forwarded unchanged.

    Returns:
        The ``elements`` attribute of the constructed document.
    """
    # ``None`` replaces the former shared ``{}`` default argument.
    extra = part_params or {}
    doc = PDFDocument(file=filename, model_params=model_params, **extra, **kwargs)
    # ``pages`` is read before ``elements`` as in the original; presumably this
    # access triggers the actual parsing -- TODO confirm against PDFDocument.
    doc.pages
    return doc.elements
|
||
|
||
def partition_image(filename, model_params, part_params=None, **kwargs):
    """Build an ``ImageDocument`` for ``filename`` and return its elements.

    Args:
        filename: Passed to ``ImageDocument`` as ``file``.
        model_params: Passed to ``ImageDocument`` as ``model_params``.
        part_params: Optional mapping of extra keyword arguments for
            ``ImageDocument``; ``None`` (the default) means no extras.
        **kwargs: Further keyword arguments forwarded unchanged.

    Returns:
        The ``elements`` attribute of the constructed document.
    """
    # ``None`` replaces the former shared ``{}`` default argument.
    extra = part_params or {}
    doc = ImageDocument(file=filename, model_params=model_params, **extra, **kwargs)
    # ``pages`` is read before ``elements`` as in the original; presumably this
    # access triggers the actual parsing -- TODO confirm against ImageDocument.
    doc.pages
    return doc.elements
|
||
|
||
# File type (lower-case extension) -> partition function handling it.
PARTITION_MAP = dict(
    pdf=partition_pdf,
    png=partition_image,
    jpeg=partition_image,
    jpg=partition_image,
    tiff=partition_image,
    doc=partition_doc,
    docx=partition_docx,
    ppt=partition_ppt,
    pptx=partition_pptx,
    xlsx=partition_xlsx,
    md=partition_md,
    html=partition_html,
)
|
||
|
||
class Pipeline(object):
    """Dispatch an input document to the partition function for its file type."""

    # Output modes understood by ``predict``.
    _MODES = ("partition", "text", "vis")

    def __init__(self, config_file: str):
        """Load the JSON configuration stored at ``config_file``.

        The ``pdf_model_params`` entry (``None`` when absent) is kept for the
        PDF and image partition functions.
        """
        # The context manager closes the handle instead of leaving it to the GC.
        with open(config_file) as fin:
            self.config = json.load(fin)
        self.pdf_model_params = self.config.get("pdf_model_params")

    def predict(self, inp: UnstructuredInput) -> UnstructuredOutput:
        """Partition ``inp.file_path`` and render the result per ``inp.mode``.

        Args:
            inp: Request whose ``file_path``, ``file_type``, ``mode`` and
                ``parameters`` fields are read.

        Returns:
            An ``UnstructuredOutput`` holding ``partitions``, ``text`` or
            ``html_text`` for modes ``partition``, ``text`` and ``vis``. An
            unknown mode or any error raised while partitioning yields
            ``status_code=400`` with the message in ``status_message``.

        Raises:
            Exception: If ``inp.file_type`` is not a key of ``PARTITION_MAP``.
        """
        if inp.file_type not in PARTITION_MAP:
            raise Exception(f"file type[{inp.file_type}] not supported")

        mode = inp.mode
        if mode not in self._MODES:
            # Reject up front: previously the document was partitioned first
            # and the failure surfaced as an opaque unbound-variable message.
            return UnstructuredOutput(
                status_code=400,
                status_message=f"mode[{mode}] not supported",
            )

        part_func = PARTITION_MAP[inp.file_type]
        # ``parameters`` is Optional on the model, so guard against ``None``.
        part_inp = {"filename": inp.file_path, **(inp.parameters or {})}
        if part_func in (partition_pdf, partition_image):
            part_inp["model_params"] = self.pdf_model_params

        try:
            elements = part_func(**part_inp)
            if mode == "partition":
                return UnstructuredOutput(partitions=convert_to_isd(elements))
            if mode == "text":
                return UnstructuredOutput(text=save_to_txt(elements))
            return UnstructuredOutput(html_text=visualize_html(elements))
        except Exception as e:
            # Deliberate catch-all: failures are reported in the response body.
            return UnstructuredOutput(status_code=400, status_message=str(e))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
from typing import Any, Dict, List, Optional, Union | ||
|
||
from pydantic import BaseModel | ||
|
||
|
||
class UnstructuredInput(BaseModel):
    # Request payload of the predict endpoint.

    # Document name; the endpoint derives the file type from its extension.
    filename: str
    # Download location, used when no base64 payload is supplied.
    url: Optional[str] = None
    # Base64-encoded document; the endpoint decodes the first entry only.
    b64_data: Optional[List[str]] = None
    # Extra options; the endpoint reads "headers" and "ssl_verify" from here
    # and the pipeline forwards every entry to the partition function.
    parameters: Optional[Dict] = {}
    # Output mode, one of: text, partition, vis.
    mode: str = "text"
    # Overwritten by the endpoint before the pipeline runs.
    file_path: Optional[str] = None
    file_type: Optional[str] = None
|
||
|
||
class UnstructuredOutput(BaseModel):
    # Response payload of the predict endpoint.

    # Status reported in the body; the pipeline sets 400 on failure.
    status_code: int = 200
    status_message: str = "success"
    # Filled for mode "text".
    text: Optional[str] = None
    # Filled for mode "vis".
    html_text: Optional[str] = None
    # Filled for mode "partition".
    partitions: List[Dict[str, Any]] = []
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
import base64 | ||
import os | ||
|
||
import requests | ||
|
||
|
||
def _post_document(filename, mode):
    """POST ``filename`` as base64 to the predict endpoint and print the reply.

    Args:
        filename: Local path of the document to upload; only its base name is
            sent to the server.
        mode: Value for the request's ``mode`` field.
    """
    url = "http://192.168.106.12:10001/v1/etl4llm/predict"
    # Read through a context manager so the file handle is closed promptly.
    with open(filename, "rb") as fin:
        b64_data = base64.b64encode(fin.read()).decode()
    inp = dict(filename=os.path.basename(filename), b64_data=[b64_data], mode=mode)
    resp = requests.post(url, json=inp).json()
    print(resp)


def test1():
    """Request text output for a JPEG sample."""
    _post_document("examples/docs/maoxuan_sample1.jpg", "text")


def test2():
    """Request text output for a PPTX sample."""
    _post_document("./examples/docs/毛泽东课件.pptx", "text")


def test3():
    """Request partition output for a PPTX sample."""
    _post_document("./examples/docs/毛泽东课件.pptx", "partition")


def test4():
    """Request visualisation (HTML) output for a PPTX sample."""
    _post_document("./examples/docs/毛泽东课件.pptx", "vis")


# test1()
# test2()
# test3()
test4()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,58 +1,54 @@ | ||
from bisheng_unstructured.documents.html_utils import save_to_txt, visualize_html | ||
from bisheng_unstructured.documents.pdf_parser.image import ImageDocument | ||
from bisheng_unstructured.documents.html_utils import visualize_html, save_to_txt | ||
|
||
TEST_RT_URL = 'http://192.168.106.12:9001/v2.1/models/' | ||
TEST_RT_URL = "http://192.168.106.12:9001/v2.1/models/" | ||
|
||
|
||
def test_image(): | ||
url = TEST_RT_URL | ||
layout_ep = url + 'elem_layout_v1/infer' | ||
cell_model_ep = url + 'elem_table_cell_detect_v1/infer' | ||
rowcol_model_ep = url + 'elem_table_rowcol_detect_v1/infer' | ||
table_model_ep = url + 'elem_table_detect_v1/infer' | ||
|
||
model_params = { | ||
'layout_ep': layout_ep, | ||
'cell_model_ep': cell_model_ep, | ||
'rowcol_model_ep': rowcol_model_ep, | ||
'table_model_ep': table_model_ep, | ||
} | ||
|
||
filename = "examples/docs/maoxuan_intro_with_table.jpg" | ||
doc = ImageDocument( | ||
file=filename, | ||
model_params=model_params) | ||
pages = doc.pages | ||
elements = doc.elements | ||
|
||
visualize_html(elements, 'data/maoxuan_intro_with_table.html') | ||
save_to_txt(elements, 'data/maoxuan_intro_with_table.txt') | ||
url = TEST_RT_URL | ||
layout_ep = url + "elem_layout_v1/infer" | ||
cell_model_ep = url + "elem_table_cell_detect_v1/infer" | ||
rowcol_model_ep = url + "elem_table_rowcol_detect_v1/infer" | ||
table_model_ep = url + "elem_table_detect_v1/infer" | ||
|
||
model_params = { | ||
"layout_ep": layout_ep, | ||
"cell_model_ep": cell_model_ep, | ||
"rowcol_model_ep": rowcol_model_ep, | ||
"table_model_ep": table_model_ep, | ||
} | ||
|
||
filename = "examples/docs/maoxuan_intro_with_table.jpg" | ||
doc = ImageDocument(file=filename, model_params=model_params) | ||
pages = doc.pages | ||
elements = doc.elements | ||
|
||
visualize_html(elements, "data/maoxuan_intro_with_table.html") | ||
save_to_txt(elements, "data/maoxuan_intro_with_table.txt") | ||
|
||
|
||
def test_image2(): | ||
url = TEST_RT_URL | ||
layout_ep = url + 'elem_layout_v1/infer' | ||
cell_model_ep = url + 'elem_table_cell_detect_v1/infer' | ||
rowcol_model_ep = url + 'elem_table_rowcol_detect_v1/infer' | ||
table_model_ep = url + 'elem_table_detect_v1/infer' | ||
|
||
model_params = { | ||
'layout_ep': layout_ep, | ||
'cell_model_ep': cell_model_ep, | ||
'rowcol_model_ep': rowcol_model_ep, | ||
'table_model_ep': table_model_ep, | ||
} | ||
|
||
filename = "examples/docs/maoxuan_sample1.jpg" | ||
doc = ImageDocument( | ||
file=filename, | ||
model_params=model_params) | ||
pages = doc.pages | ||
elements = doc.elements | ||
|
||
visualize_html(elements, 'data/maoxuan_sample1.html') | ||
save_to_txt(elements, 'data/maoxuan_sample1.txt') | ||
|
||
|
||
test_image2() | ||
test_image() | ||
url = TEST_RT_URL | ||
layout_ep = url + "elem_layout_v1/infer" | ||
cell_model_ep = url + "elem_table_cell_detect_v1/infer" | ||
rowcol_model_ep = url + "elem_table_rowcol_detect_v1/infer" | ||
table_model_ep = url + "elem_table_detect_v1/infer" | ||
|
||
model_params = { | ||
"layout_ep": layout_ep, | ||
"cell_model_ep": cell_model_ep, | ||
"rowcol_model_ep": rowcol_model_ep, | ||
"table_model_ep": table_model_ep, | ||
} | ||
|
||
filename = "examples/docs/maoxuan_sample1.jpg" | ||
doc = ImageDocument(file=filename, model_params=model_params) | ||
pages = doc.pages | ||
elements = doc.elements | ||
|
||
visualize_html(elements, "data/maoxuan_sample1.html") | ||
save_to_txt(elements, "data/maoxuan_sample1.txt") | ||
|
||
|
||
# test_image2() | ||
test_image() |
Oops, something went wrong.