Skip to content

Commit

Permalink
feat(workflow): Support JSON type in document extractor node (#9899)
Browse files Browse the repository at this point in the history
Co-authored-by: -LAN- <[email protected]>
  • Loading branch information
Theysua and laipz8200 authored Oct 26, 2024
1 parent dd3ac7a commit 216442d
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 33 deletions.
72 changes: 40 additions & 32 deletions api/core/workflow/nodes/document_extractor/node.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import csv
import io
import json

import docx
import pandas as pd
Expand Down Expand Up @@ -77,41 +78,40 @@ def _run(self):

def _extract_text_by_mime_type(*, file_content: bytes, mime_type: str) -> str:
"""Extract text from a file based on its MIME type."""
if mime_type.startswith("text/plain") or mime_type in {"text/html", "text/htm", "text/markdown", "text/xml"}:
return _extract_text_from_plain_text(file_content)
elif mime_type == "application/pdf":
return _extract_text_from_pdf(file_content)
elif mime_type in {
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/msword",
}:
return _extract_text_from_doc(file_content)
elif mime_type == "text/csv":
return _extract_text_from_csv(file_content)
elif mime_type in {
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"application/vnd.ms-excel",
}:
return _extract_text_from_excel(file_content)
elif mime_type == "application/vnd.ms-powerpoint":
return _extract_text_from_ppt(file_content)
elif mime_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
return _extract_text_from_pptx(file_content)
elif mime_type == "application/epub+zip":
return _extract_text_from_epub(file_content)
elif mime_type == "message/rfc822":
return _extract_text_from_eml(file_content)
elif mime_type == "application/vnd.ms-outlook":
return _extract_text_from_msg(file_content)
else:
raise UnsupportedFileTypeError(f"Unsupported MIME type: {mime_type}")
match mime_type:
case "text/plain" | "text/html" | "text/htm" | "text/markdown" | "text/xml":
return _extract_text_from_plain_text(file_content)
case "application/pdf":
return _extract_text_from_pdf(file_content)
case "application/vnd.openxmlformats-officedocument.wordprocessingml.document" | "application/msword":
return _extract_text_from_doc(file_content)
case "text/csv":
return _extract_text_from_csv(file_content)
case "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" | "application/vnd.ms-excel":
return _extract_text_from_excel(file_content)
case "application/vnd.ms-powerpoint":
return _extract_text_from_ppt(file_content)
case "application/vnd.openxmlformats-officedocument.presentationml.presentation":
return _extract_text_from_pptx(file_content)
case "application/epub+zip":
return _extract_text_from_epub(file_content)
case "message/rfc822":
return _extract_text_from_eml(file_content)
case "application/vnd.ms-outlook":
return _extract_text_from_msg(file_content)
case "application/json":
return _extract_text_from_json(file_content)
case _:
raise UnsupportedFileTypeError(f"Unsupported MIME type: {mime_type}")


def _extract_text_by_file_extension(*, file_content: bytes, file_extension: str) -> str:
"""Extract text from a file based on its file extension."""
match file_extension:
case ".txt" | ".markdown" | ".md" | ".html" | ".htm" | ".xml":
return _extract_text_from_plain_text(file_content)
case ".json":
return _extract_text_from_json(file_content)
case ".pdf":
return _extract_text_from_pdf(file_content)
case ".doc" | ".docx":
Expand Down Expand Up @@ -141,6 +141,14 @@ def _extract_text_from_plain_text(file_content: bytes) -> str:
raise TextExtractionError("Failed to decode plain text file") from e


def _extract_text_from_json(file_content: bytes) -> str:
try:
json_data = json.loads(file_content.decode("utf-8"))
return json.dumps(json_data, indent=2, ensure_ascii=False)
except (UnicodeDecodeError, json.JSONDecodeError) as e:
raise TextExtractionError(f"Failed to decode or parse JSON file: {e}") from e


def _extract_text_from_pdf(file_content: bytes) -> str:
try:
pdf_file = io.BytesIO(file_content)
Expand Down Expand Up @@ -183,13 +191,13 @@ def _download_file_content(file: File) -> bytes:


def _extract_text_from_file(file: File):
if file.mime_type is None:
raise UnsupportedFileTypeError("Unable to determine file type: MIME type is missing")
file_content = _download_file_content(file)
if file.transfer_method == FileTransferMethod.REMOTE_URL:
if file.extension:
extracted_text = _extract_text_by_file_extension(file_content=file_content, file_extension=file.extension)
elif file.mime_type:
extracted_text = _extract_text_by_mime_type(file_content=file_content, mime_type=file.mime_type)
else:
extracted_text = _extract_text_by_file_extension(file_content=file_content, file_extension=file.extension)
raise UnsupportedFileTypeError("Unable to determine file type: MIME type or file extension is missing")
return extracted_text


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ def test_run_extract_text(
result = document_extractor_node._run()

assert isinstance(result, NodeRunResult)
assert result.status == WorkflowNodeExecutionStatus.SUCCEEDED
assert result.status == WorkflowNodeExecutionStatus.SUCCEEDED, result.error
assert result.outputs is not None
assert result.outputs["text"] == expected_text

Expand Down

0 comments on commit 216442d

Please sign in to comment.