From 4ea716837dd08a68e9cfe04b217e6cc8bc611f0b Mon Sep 17 00:00:00 2001
From: Matt Robinson <mrobinson@unstructured.io>
Date: Fri, 16 Jun 2023 10:10:56 -0400
Subject: [PATCH] feat: add ability to extract extra metadata with regex (#763)

* first pass on regex metadata

* fix typing for regex metadata

* add dataclass back in

* add decorators

* fix tests

* update docs

* add tests for regex metadata

* add process metadata to tsv

* changelog and version

* docs typos

* consolidate to using a single kwarg

* fix test
---
 CHANGELOG.md                              |  1 +
 docs/source/index.rst                     |  6 +-
 docs/source/metadata.rst                  | 84 +++++++++++++++++++++++
 test_unstructured/partition/test_email.py | 21 +++---
 test_unstructured/partition/test_msg.py   | 21 +++---
 test_unstructured/partition/test_text.py  |  9 +++
 unstructured/documents/elements.py        | 76 ++++++++++++++++++--
 unstructured/partition/csv.py             |  9 ++-
 unstructured/partition/doc.py             |  4 +-
 unstructured/partition/docx.py            |  3 +
 unstructured/partition/email.py           |  3 +
 unstructured/partition/epub.py            |  4 +-
 unstructured/partition/html.py            |  4 +-
 unstructured/partition/image.py           |  4 +-
 unstructured/partition/json.py            |  4 +-
 unstructured/partition/md.py              |  4 +-
 unstructured/partition/msg.py             |  4 +-
 unstructured/partition/odt.py             |  9 ++-
 unstructured/partition/pdf.py             |  9 ++-
 unstructured/partition/ppt.py             |  4 +-
 unstructured/partition/pptx.py            |  3 +
 unstructured/partition/rst.py             |  4 +-
 unstructured/partition/rtf.py             |  4 +-
 unstructured/partition/text.py            |  3 +
 unstructured/partition/tsv.py             |  9 ++-
 unstructured/partition/xlsx.py            |  9 ++-
 unstructured/partition/xml.py             |  7 +-
 27 files changed, 281 insertions(+), 41 deletions(-)
 create mode 100644 docs/source/metadata.rst

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5653fd1a05..4582712013 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@
 
 ### Features
 
+* Provides users with the ability to extract additional metadata via regex.
 * Updates `partition_docx` to include headers and footers in the output.
 * Create `partition_tsv` and associated tests. Make additional changes to `detect_filetype`.
 
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 05225b0321..18a2fe97a3 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -15,7 +15,10 @@ Library Documentation
   Check out this section to learn about basic workflows in ``unstructured``.
 
 :doc:`bricks`
-  Learning more about partitioning, cleaning, and staging bricks, included advanced usage patterns.
+  Learn more about partitioning, cleaning, and staging bricks, including advanced usage patterns.
+
+:doc:`metadata`
+  Learn more about how metadata is tracked in the ``unstructured`` library.
 
 :doc:`examples`
   Examples of other types of workflows within the ``unstructured`` package.
@@ -33,5 +36,6 @@ Library Documentation
    installing
    getting_started
    bricks
+   metadata
    examples
    integrations
diff --git a/docs/source/metadata.rst b/docs/source/metadata.rst
new file mode 100644
index 0000000000..d7496c8841
--- /dev/null
+++ b/docs/source/metadata.rst
@@ -0,0 +1,84 @@
+Metadata
+========
+
+The ``unstructured`` package tracks a variety of metadata about Elements extracted from documents.
+Tracking metadata enables users to filter document elements downstream based on element metadata of interest.
+For example, a user may be interested in selecting document elements from a given page number
+or an e-mail with a given subject line.
+
+Metadata is tracked at the element level. You can extract the metadata for a given document element
+with ``element.metadata``. For a dictionary representation, use ``element.metadata.to_dict()``.
+All document types return the following metadata fields when the information is available from
+the source file:
+
+* ``filename``
+* ``file_directory``
+* ``date``
+* ``filetype``
+* ``page_number``
+
+
+Email
+-----
+
+Emails will include ``sent_from``, ``sent_to``, and ``subject`` metadata.
+``sent_from`` is a list of strings because the `RFC 822 <https://www.rfc-editor.org/rfc/rfc822>`_
+spec for emails allows for multiple sent from email addresses.
+
+
+Microsoft Excel Documents
+--------------------------
+
+For Excel documents, ``ElementMetadata`` will contain a ``page_name`` element, which corresponds
+to the sheet name in the Excel document.
+
+
+Microsoft Word Documents
+-------------------------
+
+Headers and footers in Word documents include a ``header_footer_type`` indicating which page
+a header or footer applies to. Valid values are ``"primary"``, ``"even_only"``, and ``"first_page"``.
+
+
+Webpages
+---------
+
+Elements from webpages will include a ``url`` metadata field, corresponding to the URL for the webpage.
+
+
+
+###########################
+Advanced Metadata Options
+###########################
+
+
+
+Extract Metadata with Regexes
+------------------------------
+
+``unstructured`` allows users to extract additional metadata with regexes using the ``regex_metadata`` kwarg.
+Here is an example of how to extract regex metadata:
+
+
+.. code:: python
+
+  from unstructured.partition.text import partition_text
+
+  text = "SPEAKER 1: It is my turn to speak now!"
+  elements = partition_text(text=text, regex_metadata={"speaker": r"SPEAKER \d{1,3}:"})
+  elements[0].metadata.regex_metadata
+
+The result will look like:
+
+
+.. code:: python
+
+  {'speaker':
+    [
+      {
+        'text': 'SPEAKER 1:',
+        'start': 0,
+        'end': 10,
+      }
+    ]
+  }
diff --git a/test_unstructured/partition/test_email.py b/test_unstructured/partition/test_email.py
index c05f51b688..f0157639be 100644
--- a/test_unstructured/partition/test_email.py
+++ b/test_unstructured/partition/test_email.py
@@ -206,15 +206,18 @@ def test_partition_email_has_metadata():
     filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-header.eml")
     elements = partition_email(filename=filename)
     assert len(elements) > 0
-    assert elements[0].metadata == ElementMetadata(
-        filename=filename,
-        date="2022-12-16T17:04:16-05:00",
-        page_number=None,
-        url=None,
-        sent_from=["Matthew Robinson <mrobinson@unstructured.io>"],
-        sent_to=["Matthew Robinson <mrobinson@unstructured.io>"],
-        subject="Test Email",
-        filetype="message/rfc822",
+    assert (
+        elements[0].metadata.to_dict()
+        == ElementMetadata(
+            filename=filename,
+            date="2022-12-16T17:04:16-05:00",
+            page_number=None,
+            url=None,
+            sent_from=["Matthew Robinson <mrobinson@unstructured.io>"],
+            sent_to=["Matthew Robinson <mrobinson@unstructured.io>"],
+            subject="Test Email",
+            filetype="message/rfc822",
+        ).to_dict()
     )
 
     expected_dt = datetime.datetime.fromisoformat("2022-12-16T17:04:16-05:00")
diff --git a/test_unstructured/partition/test_msg.py b/test_unstructured/partition/test_msg.py
index ddfdd299ae..875ccdd88f 100644
--- a/test_unstructured/partition/test_msg.py
+++ b/test_unstructured/partition/test_msg.py
@@ -36,15 +36,18 @@ def test_partition_msg_from_filename():
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
     elements = partition_msg(filename=filename)
     assert elements == EXPECTED_MSG_OUTPUT
-    assert elements[0].metadata == ElementMetadata(
-        filename=filename,
-        date="2022-12-16T17:04:16-05:00",
-        page_number=None,
-        url=None,
-        sent_from=["Matthew Robinson <mrobinson@unstructured.io>"],
-        sent_to=["Matthew Robinson (None)"],
-        subject="Test Email",
-        filetype="application/vnd.ms-outlook",
+    assert (
+        elements[0].metadata.to_dict()
+        == ElementMetadata(
+            filename=filename,
+            date="2022-12-16T17:04:16-05:00",
+            page_number=None,
+            url=None,
+            sent_from=["Matthew Robinson <mrobinson@unstructured.io>"],
+            sent_to=["Matthew Robinson (None)"],
+            subject="Test Email",
+            filetype="application/vnd.ms-outlook",
+        ).to_dict()
     )
 
 
diff --git a/test_unstructured/partition/test_text.py b/test_unstructured/partition/test_text.py
index da09cb3f36..f41f58b82c 100644
--- a/test_unstructured/partition/test_text.py
+++ b/test_unstructured/partition/test_text.py
@@ -145,3 +145,12 @@ def test_partition_text_groups_broken_paragraphs():
         NarrativeText(text="The big brown fox was walking down the lane."),
         NarrativeText(text="At the end of the lane, the fox met a bear."),
     ]
+
+
+def test_partition_text_extract_regex_metadata():
+    text = "SPEAKER 1: It is my turn to speak now!"
+    # The pattern has no trailing colon, so the expected match is "SPEAKER 1" at span (0, 9).
+    elements = partition_text(text=text, regex_metadata={"speaker": r"SPEAKER \d{1,3}"})
+    assert elements[0].metadata.regex_metadata == {
+        "speaker": [{"text": "SPEAKER 1", "start": 0, "end": 9}],
+    }
diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py
index b273f05501..0f68b7cda6 100644
--- a/unstructured/documents/elements.py
+++ b/unstructured/documents/elements.py
@@ -2,11 +2,14 @@
 
 import datetime
 import hashlib
+import inspect
 import os
 import pathlib
+import re
 from abc import ABC
 from dataclasses import dataclass
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
+from functools import wraps
+from typing import Any, Callable, Dict, List, Optional, Tuple, TypedDict, Union, cast
 
 
 class NoID(ABC):
@@ -30,6 +33,14 @@ def to_dict(self):
         return {key: value for key, value in self.__dict__.items() if value is not None}
 
 
+class RegexMetadata(TypedDict):
+    """A single regex match in an element's text: the matched text and its start/end span."""
+
+    text: str
+    start: int
+    end: int
+
+
 @dataclass
 class ElementMetadata:
     data_source: Optional[DataSourceMetadata] = None
@@ -58,6 +69,9 @@ class ElementMetadata:
     # Text format metadata fields
     text_as_html: Optional[str] = None
 
+    # Metadata extracted via regex
+    regex_metadata: Optional[Dict[str, List[RegexMetadata]]] = None
+
     def __post_init__(self):
         if isinstance(self.filename, pathlib.Path):
             self.filename = str(self.filename)
@@ -68,10 +82,12 @@ def __post_init__(self):
             self.filename = filename
 
     def to_dict(self):
-        dict = {key: value for key, value in self.__dict__.items() if value is not None}
+        _dict = {key: value for key, value in self.__dict__.items() if value is not None}
+        if "regex_metadata" in _dict and not _dict["regex_metadata"]:
+            _dict.pop("regex_metadata")
         if self.data_source:
-            dict["data_source"] = cast(DataSourceMetadata, self.data_source).to_dict()
-        return dict
+            _dict["data_source"] = cast(DataSourceMetadata, self.data_source).to_dict()
+        return _dict
 
     @classmethod
     def from_dict(cls, input_dict):
@@ -91,6 +107,58 @@ def get_date(self) -> Optional[datetime.datetime]:
         return dt
 
 
+def process_metadata():
+    """Decorator that adds regex metadata to the elements a partition function returns."""
+
+    def decorator(func: Callable):
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            elements = func(*args, **kwargs)
+            sig = inspect.signature(func)
+            params = dict(**dict(zip(sig.parameters, args)), **kwargs)
+            for param in sig.parameters.values():
+                if param.name not in params and param.default is not param.empty:
+                    params[param.name] = param.default
+            # An explicit regex_metadata=None is treated the same as "not provided".
+            regex_metadata: Dict[str, str] = params.get("regex_metadata") or {}
+            elements = _add_regex_metadata(elements, regex_metadata)
+
+            return elements
+
+        return wrapper
+
+    return decorator
+
+
+def _add_regex_metadata(
+    elements: List[Element],
+    regex_metadata: Optional[Dict[str, str]] = None,
+) -> List[Element]:
+    """Adds metadata based on user provided regular expressions.
+    Matches found in the text of each Text element are stored in the
+    regex_metadata attribute of that element's metadata, keyed by field name."""
+    for element in elements:
+        if isinstance(element, Text):
+            _regex_metadata: Dict[str, List[RegexMetadata]] = {}
+            for field_name, pattern in (regex_metadata or {}).items():
+                results: List[RegexMetadata] = []
+                for result in re.finditer(pattern, element.text):
+                    start, end = result.span()
+                    results.append(
+                        {
+                            "text": element.text[start:end],
+                            "start": start,
+                            "end": end,
+                        },
+                    )
+                if results:
+                    _regex_metadata[field_name] = results
+            # An empty dict (no matches) is omitted by ElementMetadata.to_dict().
+            element.metadata.regex_metadata = _regex_metadata
+
+    return elements
+
+
 class Element(ABC):
     """An element is a section of a page in the document."""
 
diff --git a/unstructured/partition/csv.py b/unstructured/partition/csv.py
index 17a08e8cea..30ba225fe6 100644
--- a/unstructured/partition/csv.py
+++ b/unstructured/partition/csv.py
@@ -4,17 +4,24 @@
 import lxml.html
 import pandas as pd
 
-from unstructured.documents.elements import Element, ElementMetadata, Table
+from unstructured.documents.elements import (
+    Element,
+    ElementMetadata,
+    Table,
+    process_metadata,
+)
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
 
 
+@process_metadata()
 @add_metadata_with_filetype(FileType.CSV)
 def partition_csv(
     filename: Optional[str] = None,
     file: Optional[Union[IO, SpooledTemporaryFile]] = None,
     metadata_filename: Optional[str] = None,
     include_metadata: bool = True,
+    **kwargs,
 ) -> List[Element]:
     """Partitions Microsoft Excel Documents in .csv format into its document elements.
 
diff --git a/unstructured/partition/doc.py b/unstructured/partition/doc.py
index 9979e20fa5..d62c48f85f 100644
--- a/unstructured/partition/doc.py
+++ b/unstructured/partition/doc.py
@@ -2,17 +2,19 @@
 import tempfile
 from typing import IO, List, Optional
 
-from unstructured.documents.elements import Element
+from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.common import convert_office_doc, exactly_one
 from unstructured.partition.docx import partition_docx
 
 
+@process_metadata()
 @add_metadata_with_filetype(FileType.DOC)
 def partition_doc(
     filename: Optional[str] = None,
     file: Optional[IO] = None,
     include_page_breaks: bool = True,
+    **kwargs,
 ) -> List[Element]:
     """Partitions Microsoft Word Documents in .doc format into its document elements.
 
diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py
index 4a6cc02234..9d73e46757 100644
--- a/unstructured/partition/docx.py
+++ b/unstructured/partition/docx.py
@@ -22,6 +22,7 @@
     Table,
     Text,
     Title,
+    process_metadata,
 )
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.common import (
@@ -102,12 +103,14 @@ def _get_runs(node, parent):
 Paragraph.runs = property(lambda self: _get_paragraph_runs(self))
 
 
+@process_metadata()
 @add_metadata_with_filetype(FileType.DOCX)
 def partition_docx(
     filename: Optional[str] = None,
     file: Optional[Union[IO, SpooledTemporaryFile]] = None,
     metadata_filename: Optional[str] = None,
     include_page_breaks: bool = True,
+    **kwargs,
 ) -> List[Element]:
     """Partitions Microsoft Word Documents in .docx format into its document elements.
 
diff --git a/unstructured/partition/email.py b/unstructured/partition/email.py
index 73c97405cf..bd746d8c7f 100644
--- a/unstructured/partition/email.py
+++ b/unstructured/partition/email.py
@@ -29,6 +29,7 @@
     NarrativeText,
     Text,
     Title,
+    process_metadata,
 )
 from unstructured.documents.email_elements import (
     MetaData,
@@ -182,6 +183,7 @@ def find_embedded_image(
     return Image(text=image_info[:-1]), element
 
 
+@process_metadata()
 @add_metadata_with_filetype(FileType.EML)
 def partition_email(
     filename: Optional[str] = None,
@@ -190,6 +192,7 @@ def partition_email(
     content_source: str = "text/html",
     encoding: Optional[str] = None,
     include_headers: bool = False,
+    **kwargs,
 ) -> List[Element]:
     """Partitions an .eml documents into its constituent elements.
     Parameters
diff --git a/unstructured/partition/epub.py b/unstructured/partition/epub.py
index 979c9cf118..c569bb9a49 100644
--- a/unstructured/partition/epub.py
+++ b/unstructured/partition/epub.py
@@ -1,15 +1,17 @@
 from typing import IO, List, Optional
 
-from unstructured.documents.elements import Element
+from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.html import convert_and_partition_html
 
 
+@process_metadata()
 @add_metadata_with_filetype(FileType.EPUB)
 def partition_epub(
     filename: Optional[str] = None,
     file: Optional[IO] = None,
     include_page_breaks: bool = False,
+    **kwargs,
 ) -> List[Element]:
     """Partitions an EPUB document. The document is first converted to HTML and then
     partitoned using partiton_html.
diff --git a/unstructured/partition/html.py b/unstructured/partition/html.py
index 8f50de27fe..ee80349d39 100644
--- a/unstructured/partition/html.py
+++ b/unstructured/partition/html.py
@@ -2,7 +2,7 @@
 
 import requests
 
-from unstructured.documents.elements import Element
+from unstructured.documents.elements import Element, process_metadata
 from unstructured.documents.html import HTMLDocument
 from unstructured.documents.xml import VALID_PARSERS
 from unstructured.file_utils.encoding import read_txt_file
@@ -17,6 +17,7 @@
 )
 
 
+@process_metadata()
 @add_metadata_with_filetype(FileType.HTML)
 def partition_html(
     filename: Optional[str] = None,
@@ -29,6 +30,7 @@ def partition_html(
     headers: Dict[str, str] = {},
     ssl_verify: bool = True,
     parser: VALID_PARSERS = None,
+    **kwargs,
 ) -> List[Element]:
     """Partitions an HTML document into its constituent elements.
 
diff --git a/unstructured/partition/image.py b/unstructured/partition/image.py
index 12325241c9..e3572c7b6d 100644
--- a/unstructured/partition/image.py
+++ b/unstructured/partition/image.py
@@ -1,10 +1,11 @@
 from typing import List, Optional
 
-from unstructured.documents.elements import Element
+from unstructured.documents.elements import Element, process_metadata
 from unstructured.partition.common import exactly_one
 from unstructured.partition.pdf import partition_pdf_or_image
 
 
+@process_metadata()
 def partition_image(
     filename: str = "",
     file: Optional[bytes] = None,
@@ -14,6 +15,7 @@ def partition_image(
     include_page_breaks: bool = False,
     ocr_languages: str = "eng",
     strategy: str = "auto",
+    **kwargs,
 ) -> List[Element]:
     """Parses an image into a list of interpreted elements.
 
diff --git a/unstructured/partition/json.py b/unstructured/partition/json.py
index 08032d183e..817e0f0917 100644
--- a/unstructured/partition/json.py
+++ b/unstructured/partition/json.py
@@ -2,18 +2,20 @@
 import re
 from typing import IO, List, Optional
 
-from unstructured.documents.elements import Element
+from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN
 from unstructured.partition.common import exactly_one
 from unstructured.staging.base import dict_to_elements
 
 
+@process_metadata()
 @add_metadata_with_filetype(FileType.JSON)
 def partition_json(
     filename: Optional[str] = None,
     file: Optional[IO] = None,
     text: Optional[str] = None,
+    **kwargs,
 ) -> List[Element]:
     """Partitions an .json document into its constituent elements."""
     if text is not None and text.strip() == "" and not file and not filename:
diff --git a/unstructured/partition/md.py b/unstructured/partition/md.py
index 435408151c..a608485cc2 100644
--- a/unstructured/partition/md.py
+++ b/unstructured/partition/md.py
@@ -3,7 +3,7 @@
 import markdown
 import requests
 
-from unstructured.documents.elements import Element
+from unstructured.documents.elements import Element, process_metadata
 from unstructured.documents.xml import VALID_PARSERS
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.common import exactly_one
@@ -16,6 +16,7 @@ def optional_decode(contents: Union[str, bytes]) -> str:
     return contents
 
 
+@process_metadata()
 @add_metadata_with_filetype(FileType.MD)
 def partition_md(
     filename: Optional[str] = None,
@@ -25,6 +26,7 @@ def partition_md(
     include_page_breaks: bool = False,
     include_metadata: bool = True,
     parser: VALID_PARSERS = None,
+    **kwargs,
 ) -> List[Element]:
     # Verify that only one of the arguments was provided
     if text is None:
diff --git a/unstructured/partition/msg.py b/unstructured/partition/msg.py
index 0749001571..60ae3b75a0 100644
--- a/unstructured/partition/msg.py
+++ b/unstructured/partition/msg.py
@@ -3,7 +3,7 @@
 
 import msg_parser
 
-from unstructured.documents.elements import Element, ElementMetadata
+from unstructured.documents.elements import Element, ElementMetadata, process_metadata
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.common import exactly_one
 from unstructured.partition.email import convert_to_iso_8601
@@ -11,10 +11,12 @@
 from unstructured.partition.text import partition_text
 
 
+@process_metadata()
 @add_metadata_with_filetype(FileType.MSG)
 def partition_msg(
     filename: Optional[str] = None,
     file: Optional[IO] = None,
+    **kwargs,
 ) -> List[Element]:
     """Partitions a MSFT Outlook .msg file
 
diff --git a/unstructured/partition/odt.py b/unstructured/partition/odt.py
index 9f8a4f1469..0009c1d610 100644
--- a/unstructured/partition/odt.py
+++ b/unstructured/partition/odt.py
@@ -1,12 +1,17 @@
 from typing import IO, List, Optional
 
-from unstructured.documents.elements import Element
+from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.docx import convert_and_partition_docx
 
 
+@process_metadata()
 @add_metadata_with_filetype(FileType.ODT)
-def partition_odt(filename: Optional[str] = None, file: Optional[IO] = None) -> List[Element]:
+def partition_odt(
+    filename: Optional[str] = None,
+    file: Optional[IO] = None,
+    **kwargs,
+) -> List[Element]:
     """Partitions Open Office Documents in .odt format into its document elements.
 
     Parameters
diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
index 8de6633a8d..4524df2e58 100644
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@@ -9,7 +9,12 @@
 from PIL import Image
 
 from unstructured.cleaners.core import clean_extra_whitespace
-from unstructured.documents.elements import Element, ElementMetadata, PageBreak
+from unstructured.documents.elements import (
+    Element,
+    ElementMetadata,
+    PageBreak,
+    process_metadata,
+)
 from unstructured.file_utils.filetype import (
     FileType,
     add_metadata_with_filetype,
@@ -26,6 +31,7 @@
 from unstructured.utils import requires_dependencies
 
 
+@process_metadata()
 @add_metadata_with_filetype(FileType.PDF)
 def partition_pdf(
     filename: str = "",
@@ -37,6 +43,7 @@ def partition_pdf(
     strategy: str = "auto",
     infer_table_structure: bool = False,
     ocr_languages: str = "eng",
+    **kwargs,
 ) -> List[Element]:
     """Parses a pdf document into a list of interpreted elements.
     Parameters
diff --git a/unstructured/partition/ppt.py b/unstructured/partition/ppt.py
index 1f1ac871e3..05c713e53c 100644
--- a/unstructured/partition/ppt.py
+++ b/unstructured/partition/ppt.py
@@ -2,17 +2,19 @@
 import tempfile
 from typing import IO, List, Optional
 
-from unstructured.documents.elements import Element
+from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.common import convert_office_doc, exactly_one
 from unstructured.partition.pptx import partition_pptx
 
 
+@process_metadata()
 @add_metadata_with_filetype(FileType.PPT)
 def partition_ppt(
     filename: Optional[str] = None,
     file: Optional[IO] = None,
     include_page_breaks: bool = False,
+    **kwargs,
 ) -> List[Element]:
     """Partitions Microsoft PowerPoint Documents in .ppt format into their document elements.
 
diff --git a/unstructured/partition/pptx.py b/unstructured/partition/pptx.py
index aaa781ef6a..58b35bddfd 100644
--- a/unstructured/partition/pptx.py
+++ b/unstructured/partition/pptx.py
@@ -12,6 +12,7 @@
     Table,
     Text,
     Title,
+    process_metadata,
 )
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.common import (
@@ -27,12 +28,14 @@
 OPENXML_SCHEMA_NAME = "{http://schemas.openxmlformats.org/drawingml/2006/main}"
 
 
+@process_metadata()
 @add_metadata_with_filetype(FileType.PPTX)
 def partition_pptx(
     filename: Optional[str] = None,
     file: Optional[Union[IO, SpooledTemporaryFile]] = None,
     include_page_breaks: bool = True,
     metadata_filename: Optional[str] = None,
+    **kwargs,
 ) -> List[Element]:
     """Partitions Microsoft PowerPoint Documents in .pptx format into its document elements.
 
diff --git a/unstructured/partition/rst.py b/unstructured/partition/rst.py
index 3b33368af1..17f7a97d05 100644
--- a/unstructured/partition/rst.py
+++ b/unstructured/partition/rst.py
@@ -1,15 +1,17 @@
 from typing import IO, List, Optional
 
-from unstructured.documents.elements import Element
+from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.html import convert_and_partition_html
 
 
+@process_metadata()
 @add_metadata_with_filetype(FileType.RST)
 def partition_rst(
     filename: Optional[str] = None,
     file: Optional[IO] = None,
     include_page_breaks: bool = False,
+    **kwargs,
 ) -> List[Element]:
     """Partitions an RST document. The document is first converted to HTML and then
     partitioned using partition_html.
diff --git a/unstructured/partition/rtf.py b/unstructured/partition/rtf.py
index 5a14734008..dde8ce342f 100644
--- a/unstructured/partition/rtf.py
+++ b/unstructured/partition/rtf.py
@@ -1,15 +1,17 @@
 from typing import IO, List, Optional
 
-from unstructured.documents.elements import Element
+from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.html import convert_and_partition_html
 
 
+@process_metadata()
 @add_metadata_with_filetype(FileType.RTF)
 def partition_rtf(
     filename: Optional[str] = None,
     file: Optional[IO] = None,
     include_page_breaks: bool = False,
+    **kwargs,
 ) -> List[Element]:
     """Partitions an RTF document. The document is first converted to HTML and then
     partitioned using partiton_html.
diff --git a/unstructured/partition/text.py b/unstructured/partition/text.py
index 19204a2fa4..1fe1162306 100644
--- a/unstructured/partition/text.py
+++ b/unstructured/partition/text.py
@@ -10,6 +10,7 @@
     NarrativeText,
     Text,
     Title,
+    process_metadata,
 )
 from unstructured.file_utils.encoding import read_txt_file
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
@@ -27,6 +28,7 @@ def split_by_paragraph(content: str) -> List[str]:
     return re.split(PARAGRAPH_PATTERN, content)
 
 
+@process_metadata()
 @add_metadata_with_filetype(FileType.TXT)
 def partition_text(
     filename: Optional[str] = None,
@@ -36,6 +38,7 @@ def partition_text(
     paragraph_grouper: Optional[Callable[[str], str]] = None,
     metadata_filename: Optional[str] = None,
     include_metadata: bool = True,
+    **kwargs,
 ) -> List[Element]:
     """Partitions an .txt documents into its constituent elements.
     Parameters
diff --git a/unstructured/partition/tsv.py b/unstructured/partition/tsv.py
index 5c4441222b..a063c3a28d 100644
--- a/unstructured/partition/tsv.py
+++ b/unstructured/partition/tsv.py
@@ -4,17 +4,24 @@
 import lxml.html
 import pandas as pd
 
-from unstructured.documents.elements import Element, ElementMetadata, Table
+from unstructured.documents.elements import (
+    Element,
+    ElementMetadata,
+    Table,
+    process_metadata,
+)
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
 
 
+@process_metadata()
 @add_metadata_with_filetype(FileType.TSV)
 def partition_tsv(
     filename: Optional[str] = None,
     file: Optional[Union[IO, SpooledTemporaryFile]] = None,
     metadata_filename: Optional[str] = None,
     include_metadata: bool = True,
+    **kwargs,
 ) -> List[Element]:
     """Partitions TSV files into document elements.
 
diff --git a/unstructured/partition/xlsx.py b/unstructured/partition/xlsx.py
index d4db0fa73e..84c0a63b9e 100644
--- a/unstructured/partition/xlsx.py
+++ b/unstructured/partition/xlsx.py
@@ -4,17 +4,24 @@
 import lxml.html
 import pandas as pd
 
-from unstructured.documents.elements import Element, ElementMetadata, Table
+from unstructured.documents.elements import (
+    Element,
+    ElementMetadata,
+    Table,
+    process_metadata,
+)
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
 
 
+@process_metadata()
 @add_metadata_with_filetype(FileType.XLSX)
 def partition_xlsx(
     filename: Optional[str] = None,
     file: Optional[Union[IO, SpooledTemporaryFile]] = None,
     metadata_filename: Optional[str] = None,
     include_metadata: bool = True,
+    **kwargs,
 ) -> List[Element]:
     """Partitions Microsoft Excel Documents in .xlsx format into its document elements.
 
diff --git a/unstructured/partition/xml.py b/unstructured/partition/xml.py
index 37166ced63..f5333e7832 100644
--- a/unstructured/partition/xml.py
+++ b/unstructured/partition/xml.py
@@ -1,7 +1,8 @@
 import xml.etree.ElementTree as ET
 from tempfile import SpooledTemporaryFile
-from typing import IO, BinaryIO, Optional, Union, cast
+from typing import IO, BinaryIO, List, Optional, Union, cast
 
+from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.encoding import read_txt_file
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
@@ -38,6 +39,7 @@ def get_leaf_elements(
     return "\n".join(leaf_elements)  # type: ignore
 
 
+@process_metadata()
 @add_metadata_with_filetype(FileType.XML)
 def partition_xml(
     filename: Optional[str] = None,
@@ -47,7 +49,8 @@ def partition_xml(
     metadata_filename: Optional[str] = None,
     include_metadata: bool = True,
     encoding: Optional[str] = None,
-):
+    **kwargs,
+) -> List[Element]:
     """Partitions an XML document into its document elements.
 
     Parameters