Skip to content

Commit

Permalink
feat: add YAML type in document extractor node
Browse files Browse the repository at this point in the history
  • Loading branch information
hwzhuhao committed Oct 29, 2024
1 parent c8ef922 commit 698ba5a
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 1 deletion.
15 changes: 14 additions & 1 deletion api/core/workflow/nodes/document_extractor/node.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import docx
import pandas as pd
import pypdfium2
import yaml
from unstructured.partition.email import partition_email
from unstructured.partition.epub import partition_epub
from unstructured.partition.msg import partition_msg
Expand All @@ -21,7 +22,7 @@
from models.workflow import WorkflowNodeExecutionStatus

from .entities import DocumentExtractorNodeData
from .exc import DocumentExtractorError, FileDownloadError, TextExtractionError, UnsupportedFileTypeError
from .error import DocumentExtractorError, FileDownloadError, TextExtractionError, UnsupportedFileTypeError


class DocumentExtractorNode(BaseNode[DocumentExtractorNodeData]):
Expand Down Expand Up @@ -101,6 +102,8 @@ def _extract_text_by_mime_type(*, file_content: bytes, mime_type: str) -> str:
return _extract_text_from_msg(file_content)
case "application/json":
return _extract_text_from_json(file_content)
case "application/x-yaml" | "text/yaml":
return _extract_text_from_yaml(file_content)
case _:
raise UnsupportedFileTypeError(f"Unsupported MIME type: {mime_type}")

Expand All @@ -112,6 +115,8 @@ def _extract_text_by_file_extension(*, file_content: bytes, file_extension: str)
return _extract_text_from_plain_text(file_content)
case ".json":
return _extract_text_from_json(file_content)
case ".yaml" | ".yml":
return _extract_text_from_yaml(file_content)
case ".pdf":
return _extract_text_from_pdf(file_content)
case ".doc" | ".docx":
Expand Down Expand Up @@ -149,6 +154,14 @@ def _extract_text_from_json(file_content: bytes) -> str:
raise TextExtractionError(f"Failed to decode or parse JSON file: {e}") from e


def _extract_text_from_yaml(file_content: bytes) -> str:
    """Decode YAML bytes as UTF-8 and return them re-serialized as YAML text.

    Every document in a multi-document stream is parsed with the safe
    loader and dumped back out, keeping non-ASCII characters unescaped and
    leaving mapping keys in their original order.

    Args:
        file_content: Raw bytes of the YAML file.

    Returns:
        The YAML text produced by dumping all parsed documents.

    Raises:
        TextExtractionError: If the bytes are not valid UTF-8 or the
            content cannot be parsed as YAML.
    """
    try:
        decoded_text = file_content.decode("utf-8")
        # safe_load_all returns a lazy iterator: the actual parsing happens
        # while dump_all consumes it, so the dump must stay inside this try
        # for YAML errors to be wrapped below.
        documents = yaml.safe_load_all(decoded_text)
        extracted_text = yaml.dump_all(
            documents,
            allow_unicode=True,
            sort_keys=False,
        )
    except (UnicodeDecodeError, yaml.YAMLError) as e:
        raise TextExtractionError(f"Failed to decode or parse YAML file: {e}") from e
    return extracted_text


def _extract_text_from_pdf(file_content: bytes) -> str:
try:
pdf_file = io.BytesIO(file_content)
Expand Down

0 comments on commit 698ba5a

Please sign in to comment.