From 4ea716837dd08a68e9cfe04b217e6cc8bc611f0b Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Fri, 16 Jun 2023 10:10:56 -0400 Subject: [PATCH] feat: add ability to extract extra metadata with regex (#763) * first pass on regex metadata * fix typing for regex metadata * add dataclass back in * add decorators * fix tests * update docs * add tests for regex metadata * add process metadata to tsv * changelog and version * docs typos * consolidate to using a single kwarg * fix test --- CHANGELOG.md | 1 + docs/source/index.rst | 6 +- docs/source/metadata.rst | 84 +++++++++++++++++++++++ test_unstructured/partition/test_email.py | 21 +++--- test_unstructured/partition/test_msg.py | 21 +++--- test_unstructured/partition/test_text.py | 9 +++ unstructured/documents/elements.py | 76 ++++++++++++++++++-- unstructured/partition/csv.py | 9 ++- unstructured/partition/doc.py | 4 +- unstructured/partition/docx.py | 3 + unstructured/partition/email.py | 3 + unstructured/partition/epub.py | 4 +- unstructured/partition/html.py | 4 +- unstructured/partition/image.py | 4 +- unstructured/partition/json.py | 4 +- unstructured/partition/md.py | 4 +- unstructured/partition/msg.py | 4 +- unstructured/partition/odt.py | 9 ++- unstructured/partition/pdf.py | 9 ++- unstructured/partition/ppt.py | 4 +- unstructured/partition/pptx.py | 3 + unstructured/partition/rst.py | 4 +- unstructured/partition/rtf.py | 4 +- unstructured/partition/text.py | 3 + unstructured/partition/tsv.py | 9 ++- unstructured/partition/xlsx.py | 9 ++- unstructured/partition/xml.py | 7 +- 27 files changed, 281 insertions(+), 41 deletions(-) create mode 100644 docs/source/metadata.rst diff --git a/CHANGELOG.md b/CHANGELOG.md index 5653fd1a05..4582712013 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ ### Features +* Provides users with the ability to extract additional metadata via regex. * Updates `partition_docx` to include headers and footers in the output. * Create `partition_tsv` and associated tests. 
Make additional changes to `detect_filetype`. diff --git a/docs/source/index.rst b/docs/source/index.rst index 05225b0321..18a2fe97a3 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -15,7 +15,10 @@ Library Documentation Check out this section to learn about basic workflows in ``unstructured``. :doc:`bricks` - Learning more about partitioning, cleaning, and staging bricks, included advanced usage patterns. + Learn more about partitioning, cleaning, and staging bricks, including advanced usage patterns. + +:doc:`metadata` + Learn more about how metadata is tracked in the ``unstructured`` library. :doc:`examples` Examples of other types of workflows within the ``unstructured`` package. @@ -33,5 +36,6 @@ Library Documentation installing getting_started bricks + metadata examples integrations diff --git a/docs/source/metadata.rst b/docs/source/metadata.rst new file mode 100644 index 0000000000..d7496c8841 --- /dev/null +++ b/docs/source/metadata.rst @@ -0,0 +1,84 @@ +Metadata +======== + +The ``unstructured`` package tracks a variety of metadata about Elements extracted from documents. +Tracking metadata enables users to filter document elements downstream based on element metadata of interest. +For example, a user may be interested in selected document elements from a given page number +or an e-mail with a given subject line. + +Metadata is tracked at the element level. You can extract the metadata for a given document element +with ``element.metadata``. For a dictionary representation, use ``element.metadata.to_dict()``. +All document types return the following metadata fields when the information is available from +the source file: + +* ``filename`` +* ``file_directory`` +* ``date`` +* ``filetype`` +* ``page_number`` + + +Email +----- + +Emails will include ``sent_from``, ``sent_to``, and ``subject`` metadata. +``sent_from`` is a list of strings because the `RFC 822 <https://www.rfc-editor.org/rfc/rfc822>`_ +spec for emails allows for multiple sent from email addresses. 
+ + +Microsoft Excel Documents +-------------------------- + +For Excel documents, ``ElementMetadata`` will contain a ``page_name`` element, which corresponds +to the sheet name in the Excel document. + + +Microsoft Word Documents +------------------------- + +Headers and footers in Word documents include a ``header_footer_type`` indicating which page +a header or footer applies to. Valid values are ``"primary"``, ``"even_only"``, and ``"first_page"``. + + +Webpages +--------- + +Elements from webpages will include a ``url`` metadata field, corresponding to the URL for the webpage. + + + +########################### +Advanced Metadata Options +########################### + + + +Extract Metadata with Regexes +------------------------------ + +``unstructured`` allows users to extract additional metadata with regexes using the ``regex_metadata`` kwarg. +Here is an example of how to extract regex metadata: + + +.. code:: python + + from unstructured.partition.text import partition_text + + text = "SPEAKER 1: It is my turn to speak now!" + elements = partition_text(text=text, regex_metadata={"speaker": r"SPEAKER \d{1,3}:"}) + elements[0].metadata.regex_metadata + +The result will look like: + + +.. 
code:: python + + {'speaker': + [ + { + 'text': 'SPEAKER 1:', + 'start': 0, + 'end': 10, + } + ] + } diff --git a/test_unstructured/partition/test_email.py b/test_unstructured/partition/test_email.py index c05f51b688..f0157639be 100644 --- a/test_unstructured/partition/test_email.py +++ b/test_unstructured/partition/test_email.py @@ -206,15 +206,18 @@ def test_partition_email_has_metadata(): filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-header.eml") elements = partition_email(filename=filename) assert len(elements) > 0 - assert elements[0].metadata == ElementMetadata( - filename=filename, - date="2022-12-16T17:04:16-05:00", - page_number=None, - url=None, - sent_from=["Matthew Robinson "], - sent_to=["Matthew Robinson "], - subject="Test Email", - filetype="message/rfc822", + assert ( + elements[0].metadata.to_dict() + == ElementMetadata( + filename=filename, + date="2022-12-16T17:04:16-05:00", + page_number=None, + url=None, + sent_from=["Matthew Robinson "], + sent_to=["Matthew Robinson "], + subject="Test Email", + filetype="message/rfc822", + ).to_dict() ) expected_dt = datetime.datetime.fromisoformat("2022-12-16T17:04:16-05:00") diff --git a/test_unstructured/partition/test_msg.py b/test_unstructured/partition/test_msg.py index ddfdd299ae..875ccdd88f 100644 --- a/test_unstructured/partition/test_msg.py +++ b/test_unstructured/partition/test_msg.py @@ -36,15 +36,18 @@ def test_partition_msg_from_filename(): filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg") elements = partition_msg(filename=filename) assert elements == EXPECTED_MSG_OUTPUT - assert elements[0].metadata == ElementMetadata( - filename=filename, - date="2022-12-16T17:04:16-05:00", - page_number=None, - url=None, - sent_from=["Matthew Robinson "], - sent_to=["Matthew Robinson (None)"], - subject="Test Email", - filetype="application/vnd.ms-outlook", + assert ( + elements[0].metadata.to_dict() + == ElementMetadata( + filename=filename, + 
date="2022-12-16T17:04:16-05:00", + page_number=None, + url=None, + sent_from=["Matthew Robinson "], + sent_to=["Matthew Robinson (None)"], + subject="Test Email", + filetype="application/vnd.ms-outlook", + ).to_dict() ) diff --git a/test_unstructured/partition/test_text.py b/test_unstructured/partition/test_text.py index da09cb3f36..f41f58b82c 100644 --- a/test_unstructured/partition/test_text.py +++ b/test_unstructured/partition/test_text.py @@ -145,3 +145,12 @@ def test_partition_text_groups_broken_paragraphs(): NarrativeText(text="The big brown fox was walking down the lane."), NarrativeText(text="At the end of the lane, the fox met a bear."), ] + + +def test_partition_text_extract_regex_metadata(): + text = "SPEAKER 1: It is my turn to speak now!" + + elements = partition_text(text=text, regex_metadata={"speaker": r"SPEAKER \d{1,3}"}) + assert elements[0].metadata.regex_metadata == { + "speaker": [{"text": "SPEAKER 1", "start": 0, "end": 9}], + } diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index b273f05501..0f68b7cda6 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -2,11 +2,14 @@ import datetime import hashlib +import inspect import os import pathlib +import re from abc import ABC from dataclasses import dataclass -from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast +from functools import wraps +from typing import Any, Callable, Dict, List, Optional, Tuple, TypedDict, Union, cast class NoID(ABC): @@ -30,6 +33,14 @@ def to_dict(self): return {key: value for key, value in self.__dict__.items() if value is not None} +class RegexMetadata(TypedDict): + """Metadata that is extracted from a document element via regex.""" + + text: str + start: int + end: int + + @dataclass class ElementMetadata: data_source: Optional[DataSourceMetadata] = None @@ -58,6 +69,9 @@ class ElementMetadata: # Text format metadata fields text_as_html: Optional[str] = None + # Metadata 
extracted via regex + regex_metadata: Optional[Dict[str, List[RegexMetadata]]] = None + def __post_init__(self): if isinstance(self.filename, pathlib.Path): self.filename = str(self.filename) @@ -68,10 +82,12 @@ def __post_init__(self): self.filename = filename def to_dict(self): - dict = {key: value for key, value in self.__dict__.items() if value is not None} + _dict = {key: value for key, value in self.__dict__.items() if value is not None} + if "regex_metadata" in _dict and not _dict["regex_metadata"]: + _dict.pop("regex_metadata") if self.data_source: - dict["data_source"] = cast(DataSourceMetadata, self.data_source).to_dict() - return dict + _dict["data_source"] = cast(DataSourceMetadata, self.data_source).to_dict() + return _dict @classmethod def from_dict(cls, input_dict): @@ -91,6 +107,58 @@ def get_date(self) -> Optional[datetime.datetime]: return dt +def process_metadata(): + """Decorator for processing metadata for document elements.""" + + def decorator(func: Callable): + @wraps(func) + def wrapper(*args, **kwargs): + elements = func(*args, **kwargs) + sig = inspect.signature(func) + params = dict(**dict(zip(sig.parameters, args)), **kwargs) + for param in sig.parameters.values(): + if param.name not in params and param.default is not param.empty: + params[param.name] = param.default + + regex_metadata: Dict["str", "str"] = params.get("regex_metadata", {}) + elements = _add_regex_metadata(elements, regex_metadata) + + return elements + + return wrapper + + return decorator + + +def _add_regex_metadata( + elements: List[Element], + regex_metadata: Dict[str, str] = {}, +) -> List[Element]: + """Adds metadata based on a user provided regular expression. 
+ The additional metadata will be added to the regex_metadata + attribute in the element metadata.""" + for element in elements: + if isinstance(element, Text): + _regex_metadata: Dict["str", List[RegexMetadata]] = {} + for field_name, pattern in regex_metadata.items(): + results: List[RegexMetadata] = [] + for result in re.finditer(pattern, element.text): + start, end = result.span() + results.append( + { + "text": element.text[start:end], + "start": start, + "end": end, + }, + ) + if len(results) > 0: + _regex_metadata[field_name] = results + + element.metadata.regex_metadata = _regex_metadata + + return elements + + class Element(ABC): """An element is a section of a page in the document.""" diff --git a/unstructured/partition/csv.py b/unstructured/partition/csv.py index 17a08e8cea..30ba225fe6 100644 --- a/unstructured/partition/csv.py +++ b/unstructured/partition/csv.py @@ -4,17 +4,24 @@ import lxml.html import pandas as pd -from unstructured.documents.elements import Element, ElementMetadata, Table +from unstructured.documents.elements import ( + Element, + ElementMetadata, + Table, + process_metadata, +) from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed +@process_metadata() @add_metadata_with_filetype(FileType.CSV) def partition_csv( filename: Optional[str] = None, file: Optional[Union[IO, SpooledTemporaryFile]] = None, metadata_filename: Optional[str] = None, include_metadata: bool = True, + **kwargs, ) -> List[Element]: """Partitions Microsoft Excel Documents in .csv format into its document elements. 
diff --git a/unstructured/partition/doc.py b/unstructured/partition/doc.py index 9979e20fa5..d62c48f85f 100644 --- a/unstructured/partition/doc.py +++ b/unstructured/partition/doc.py @@ -2,17 +2,19 @@ import tempfile from typing import IO, List, Optional -from unstructured.documents.elements import Element +from unstructured.documents.elements import Element, process_metadata from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.partition.common import convert_office_doc, exactly_one from unstructured.partition.docx import partition_docx +@process_metadata() @add_metadata_with_filetype(FileType.DOC) def partition_doc( filename: Optional[str] = None, file: Optional[IO] = None, include_page_breaks: bool = True, + **kwargs, ) -> List[Element]: """Partitions Microsoft Word Documents in .doc format into its document elements. diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py index 4a6cc02234..9d73e46757 100644 --- a/unstructured/partition/docx.py +++ b/unstructured/partition/docx.py @@ -22,6 +22,7 @@ Table, Text, Title, + process_metadata, ) from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.partition.common import ( @@ -102,12 +103,14 @@ def _get_runs(node, parent): Paragraph.runs = property(lambda self: _get_paragraph_runs(self)) +@process_metadata() @add_metadata_with_filetype(FileType.DOCX) def partition_docx( filename: Optional[str] = None, file: Optional[Union[IO, SpooledTemporaryFile]] = None, metadata_filename: Optional[str] = None, include_page_breaks: bool = True, + **kwargs, ) -> List[Element]: """Partitions Microsoft Word Documents in .docx format into its document elements. 
diff --git a/unstructured/partition/email.py b/unstructured/partition/email.py index 73c97405cf..bd746d8c7f 100644 --- a/unstructured/partition/email.py +++ b/unstructured/partition/email.py @@ -29,6 +29,7 @@ NarrativeText, Text, Title, + process_metadata, ) from unstructured.documents.email_elements import ( MetaData, @@ -182,6 +183,7 @@ def find_embedded_image( return Image(text=image_info[:-1]), element +@process_metadata() @add_metadata_with_filetype(FileType.EML) def partition_email( filename: Optional[str] = None, @@ -190,6 +192,7 @@ def partition_email( content_source: str = "text/html", encoding: Optional[str] = None, include_headers: bool = False, + **kwargs, ) -> List[Element]: """Partitions an .eml documents into its constituent elements. Parameters diff --git a/unstructured/partition/epub.py b/unstructured/partition/epub.py index 979c9cf118..c569bb9a49 100644 --- a/unstructured/partition/epub.py +++ b/unstructured/partition/epub.py @@ -1,15 +1,17 @@ from typing import IO, List, Optional -from unstructured.documents.elements import Element +from unstructured.documents.elements import Element, process_metadata from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.partition.html import convert_and_partition_html +@process_metadata() @add_metadata_with_filetype(FileType.EPUB) def partition_epub( filename: Optional[str] = None, file: Optional[IO] = None, include_page_breaks: bool = False, + **kwargs, ) -> List[Element]: """Partitions an EPUB document. The document is first converted to HTML and then partitoned using partiton_html. 
diff --git a/unstructured/partition/html.py b/unstructured/partition/html.py index 8f50de27fe..ee80349d39 100644 --- a/unstructured/partition/html.py +++ b/unstructured/partition/html.py @@ -2,7 +2,7 @@ import requests -from unstructured.documents.elements import Element +from unstructured.documents.elements import Element, process_metadata from unstructured.documents.html import HTMLDocument from unstructured.documents.xml import VALID_PARSERS from unstructured.file_utils.encoding import read_txt_file @@ -17,6 +17,7 @@ ) +@process_metadata() @add_metadata_with_filetype(FileType.HTML) def partition_html( filename: Optional[str] = None, @@ -29,6 +30,7 @@ def partition_html( headers: Dict[str, str] = {}, ssl_verify: bool = True, parser: VALID_PARSERS = None, + **kwargs, ) -> List[Element]: """Partitions an HTML document into its constituent elements. diff --git a/unstructured/partition/image.py b/unstructured/partition/image.py index 12325241c9..e3572c7b6d 100644 --- a/unstructured/partition/image.py +++ b/unstructured/partition/image.py @@ -1,10 +1,11 @@ from typing import List, Optional -from unstructured.documents.elements import Element +from unstructured.documents.elements import Element, process_metadata from unstructured.partition.common import exactly_one from unstructured.partition.pdf import partition_pdf_or_image +@process_metadata() def partition_image( filename: str = "", file: Optional[bytes] = None, @@ -14,6 +15,7 @@ def partition_image( include_page_breaks: bool = False, ocr_languages: str = "eng", strategy: str = "auto", + **kwargs, ) -> List[Element]: """Parses an image into a list of interpreted elements. 
diff --git a/unstructured/partition/json.py b/unstructured/partition/json.py index 08032d183e..817e0f0917 100644 --- a/unstructured/partition/json.py +++ b/unstructured/partition/json.py @@ -2,18 +2,20 @@ import re from typing import IO, List, Optional -from unstructured.documents.elements import Element +from unstructured.documents.elements import Element, process_metadata from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN from unstructured.partition.common import exactly_one from unstructured.staging.base import dict_to_elements +@process_metadata() @add_metadata_with_filetype(FileType.JSON) def partition_json( filename: Optional[str] = None, file: Optional[IO] = None, text: Optional[str] = None, + **kwargs, ) -> List[Element]: """Partitions an .json document into its constituent elements.""" if text is not None and text.strip() == "" and not file and not filename: diff --git a/unstructured/partition/md.py b/unstructured/partition/md.py index 435408151c..a608485cc2 100644 --- a/unstructured/partition/md.py +++ b/unstructured/partition/md.py @@ -3,7 +3,7 @@ import markdown import requests -from unstructured.documents.elements import Element +from unstructured.documents.elements import Element, process_metadata from unstructured.documents.xml import VALID_PARSERS from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.partition.common import exactly_one @@ -16,6 +16,7 @@ def optional_decode(contents: Union[str, bytes]) -> str: return contents +@process_metadata() @add_metadata_with_filetype(FileType.MD) def partition_md( filename: Optional[str] = None, @@ -25,6 +26,7 @@ def partition_md( include_page_breaks: bool = False, include_metadata: bool = True, parser: VALID_PARSERS = None, + **kwargs, ) -> List[Element]: # Verify that only one of the arguments was provided if text is None: diff --git a/unstructured/partition/msg.py 
b/unstructured/partition/msg.py index 0749001571..60ae3b75a0 100644 --- a/unstructured/partition/msg.py +++ b/unstructured/partition/msg.py @@ -3,7 +3,7 @@ import msg_parser -from unstructured.documents.elements import Element, ElementMetadata +from unstructured.documents.elements import Element, ElementMetadata, process_metadata from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.partition.common import exactly_one from unstructured.partition.email import convert_to_iso_8601 @@ -11,10 +11,12 @@ from unstructured.partition.text import partition_text +@process_metadata() @add_metadata_with_filetype(FileType.MSG) def partition_msg( filename: Optional[str] = None, file: Optional[IO] = None, + **kwargs, ) -> List[Element]: """Partitions a MSFT Outlook .msg file diff --git a/unstructured/partition/odt.py b/unstructured/partition/odt.py index 9f8a4f1469..0009c1d610 100644 --- a/unstructured/partition/odt.py +++ b/unstructured/partition/odt.py @@ -1,12 +1,17 @@ from typing import IO, List, Optional -from unstructured.documents.elements import Element +from unstructured.documents.elements import Element, process_metadata from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.partition.docx import convert_and_partition_docx +@process_metadata() @add_metadata_with_filetype(FileType.ODT) -def partition_odt(filename: Optional[str] = None, file: Optional[IO] = None) -> List[Element]: +def partition_odt( + filename: Optional[str] = None, + file: Optional[IO] = None, + **kwargs, +) -> List[Element]: """Partitions Open Office Documents in .odt format into its document elements. 
Parameters diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 8de6633a8d..4524df2e58 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -9,7 +9,12 @@ from PIL import Image from unstructured.cleaners.core import clean_extra_whitespace -from unstructured.documents.elements import Element, ElementMetadata, PageBreak +from unstructured.documents.elements import ( + Element, + ElementMetadata, + PageBreak, + process_metadata, +) from unstructured.file_utils.filetype import ( FileType, add_metadata_with_filetype, @@ -26,6 +31,7 @@ from unstructured.utils import requires_dependencies +@process_metadata() @add_metadata_with_filetype(FileType.PDF) def partition_pdf( filename: str = "", @@ -37,6 +43,7 @@ def partition_pdf( strategy: str = "auto", infer_table_structure: bool = False, ocr_languages: str = "eng", + **kwargs, ) -> List[Element]: """Parses a pdf document into a list of interpreted elements. Parameters diff --git a/unstructured/partition/ppt.py b/unstructured/partition/ppt.py index 1f1ac871e3..05c713e53c 100644 --- a/unstructured/partition/ppt.py +++ b/unstructured/partition/ppt.py @@ -2,17 +2,19 @@ import tempfile from typing import IO, List, Optional -from unstructured.documents.elements import Element +from unstructured.documents.elements import Element, process_metadata from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.partition.common import convert_office_doc, exactly_one from unstructured.partition.pptx import partition_pptx +@process_metadata() @add_metadata_with_filetype(FileType.PPT) def partition_ppt( filename: Optional[str] = None, file: Optional[IO] = None, include_page_breaks: bool = False, + **kwargs, ) -> List[Element]: """Partitions Microsoft PowerPoint Documents in .ppt format into their document elements. 
diff --git a/unstructured/partition/pptx.py b/unstructured/partition/pptx.py index aaa781ef6a..58b35bddfd 100644 --- a/unstructured/partition/pptx.py +++ b/unstructured/partition/pptx.py @@ -12,6 +12,7 @@ Table, Text, Title, + process_metadata, ) from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.partition.common import ( @@ -27,12 +28,14 @@ OPENXML_SCHEMA_NAME = "{http://schemas.openxmlformats.org/drawingml/2006/main}" +@process_metadata() @add_metadata_with_filetype(FileType.PPTX) def partition_pptx( filename: Optional[str] = None, file: Optional[Union[IO, SpooledTemporaryFile]] = None, include_page_breaks: bool = True, metadata_filename: Optional[str] = None, + **kwargs, ) -> List[Element]: """Partitions Microsoft PowerPoint Documents in .pptx format into its document elements. diff --git a/unstructured/partition/rst.py b/unstructured/partition/rst.py index 3b33368af1..17f7a97d05 100644 --- a/unstructured/partition/rst.py +++ b/unstructured/partition/rst.py @@ -1,15 +1,17 @@ from typing import IO, List, Optional -from unstructured.documents.elements import Element +from unstructured.documents.elements import Element, process_metadata from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.partition.html import convert_and_partition_html +@process_metadata() @add_metadata_with_filetype(FileType.RST) def partition_rst( filename: Optional[str] = None, file: Optional[IO] = None, include_page_breaks: bool = False, + **kwargs, ) -> List[Element]: """Partitions an RST document. The document is first converted to HTML and then partitioned using partition_html. 
diff --git a/unstructured/partition/rtf.py b/unstructured/partition/rtf.py index 5a14734008..dde8ce342f 100644 --- a/unstructured/partition/rtf.py +++ b/unstructured/partition/rtf.py @@ -1,15 +1,17 @@ from typing import IO, List, Optional -from unstructured.documents.elements import Element +from unstructured.documents.elements import Element, process_metadata from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.partition.html import convert_and_partition_html +@process_metadata() @add_metadata_with_filetype(FileType.RTF) def partition_rtf( filename: Optional[str] = None, file: Optional[IO] = None, include_page_breaks: bool = False, + **kwargs, ) -> List[Element]: """Partitions an RTF document. The document is first converted to HTML and then partitioned using partiton_html. diff --git a/unstructured/partition/text.py b/unstructured/partition/text.py index 19204a2fa4..1fe1162306 100644 --- a/unstructured/partition/text.py +++ b/unstructured/partition/text.py @@ -10,6 +10,7 @@ NarrativeText, Text, Title, + process_metadata, ) from unstructured.file_utils.encoding import read_txt_file from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype @@ -27,6 +28,7 @@ def split_by_paragraph(content: str) -> List[str]: return re.split(PARAGRAPH_PATTERN, content) +@process_metadata() @add_metadata_with_filetype(FileType.TXT) def partition_text( filename: Optional[str] = None, @@ -36,6 +38,7 @@ def partition_text( paragraph_grouper: Optional[Callable[[str], str]] = None, metadata_filename: Optional[str] = None, include_metadata: bool = True, + **kwargs, ) -> List[Element]: """Partitions an .txt documents into its constituent elements. 
Parameters diff --git a/unstructured/partition/tsv.py b/unstructured/partition/tsv.py index 5c4441222b..a063c3a28d 100644 --- a/unstructured/partition/tsv.py +++ b/unstructured/partition/tsv.py @@ -4,17 +4,24 @@ import lxml.html import pandas as pd -from unstructured.documents.elements import Element, ElementMetadata, Table +from unstructured.documents.elements import ( + Element, + ElementMetadata, + Table, + process_metadata, +) from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed +@process_metadata() @add_metadata_with_filetype(FileType.TSV) def partition_tsv( filename: Optional[str] = None, file: Optional[Union[IO, SpooledTemporaryFile]] = None, metadata_filename: Optional[str] = None, include_metadata: bool = True, + **kwargs, ) -> List[Element]: """Partitions TSV files into document elements. diff --git a/unstructured/partition/xlsx.py b/unstructured/partition/xlsx.py index d4db0fa73e..84c0a63b9e 100644 --- a/unstructured/partition/xlsx.py +++ b/unstructured/partition/xlsx.py @@ -4,17 +4,24 @@ import lxml.html import pandas as pd -from unstructured.documents.elements import Element, ElementMetadata, Table +from unstructured.documents.elements import ( + Element, + ElementMetadata, + Table, + process_metadata, +) from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed +@process_metadata() @add_metadata_with_filetype(FileType.XLSX) def partition_xlsx( filename: Optional[str] = None, file: Optional[Union[IO, SpooledTemporaryFile]] = None, metadata_filename: Optional[str] = None, include_metadata: bool = True, + **kwargs, ) -> List[Element]: """Partitions Microsoft Excel Documents in .xlsx format into its document elements. 
diff --git a/unstructured/partition/xml.py b/unstructured/partition/xml.py index 37166ced63..f5333e7832 100644 --- a/unstructured/partition/xml.py +++ b/unstructured/partition/xml.py @@ -1,7 +1,8 @@ import xml.etree.ElementTree as ET from tempfile import SpooledTemporaryFile -from typing import IO, BinaryIO, Optional, Union, cast +from typing import IO, BinaryIO, List, Optional, Union, cast +from unstructured.documents.elements import Element, process_metadata from unstructured.file_utils.encoding import read_txt_file from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed @@ -38,6 +39,7 @@ def get_leaf_elements( return "\n".join(leaf_elements) # type: ignore +@process_metadata() @add_metadata_with_filetype(FileType.XML) def partition_xml( filename: Optional[str] = None, @@ -47,7 +49,8 @@ def partition_xml( metadata_filename: Optional[str] = None, include_metadata: bool = True, encoding: Optional[str] = None, -): + **kwargs, +) -> List[Element]: """Partitions an XML document into its document elements. Parameters