feat: add ability to extract extra metadata with regex (#763)

* first pass on regex metadata * fix typing for regex metadata * add dataclass back in * add decorators * fix tests * update docs * add tests for regex metadata * add process metadata to tsv * changelog and version * docs typos * consolidate to using a single kwarg * fix test
Unstructured-IO · Jun 16, 2023 · 4ea7168 · 4ea7168
1 parent ec403e2
commit 4ea7168
Show file tree

Hide file tree

Showing 27 changed files with 281 additions and 41 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,7 @@
 
 ### Features
 
+* Provides users with the ability to extract additional metadata via regex.
 * Updates `partition_docx` to include headers and footers in the output.
 * Create `partition_tsv` and associated tests. Make additional changes to `detect_filetype`.
 

diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -15,7 +15,10 @@ Library Documentation
   Check out this section to learn about basic workflows in ``unstructured``.
 
 :doc:`bricks`
-  Learning more about partitioning, cleaning, and staging bricks, included advanced usage patterns.
+  Learn more about partitioning, cleaning, and staging bricks, including advanced usage patterns.
+
+:doc:`metadata`
+  Learn more about how metadata is tracked in the ``unstructured`` library.
 
 :doc:`examples`
   Examples of other types of workflows within the ``unstructured`` package.
@@ -33,5 +36,6 @@ Library Documentation
    installing
    getting_started
    bricks
+   metadata
    examples
    integrations
diff --git a/docs/source/metadata.rst b/docs/source/metadata.rst
@@ -0,0 +1,84 @@
+Metadata
+========
+
+The ``unstructured`` package tracks a variety of metadata about Elements extracted from documents.
+Tracking metadata enables users to filter document elements downstream based on element metadata of interest.
+For example, a user may be interested in selecting document elements from a given page number
+or an e-mail with a given subject line.
+
+Metadata is tracked at the element level. You can extract the metadata for a given document element
+with ``element.metadata``. For a dictionary representation, use ``element.metadata.to_dict()``.
+All document types return the following metadata fields when the information is available from
+the source file:
+
+* ``filename``
+* ``file_directory``
+* ``date``
+* ``filetype``
+* ``page_number``
+
+
+Email
+-----
+
+Emails will include ``sent_from``, ``sent_to``, and ``subject`` metadata.
+``sent_from`` is a list of strings because the `RFC 822 <https://www.rfc-editor.org/rfc/rfc822>`_
+spec for emails allows a message to have multiple sender addresses.
+
+
+Microsoft Excel Documents
+--------------------------
+
+For Excel documents, ``ElementMetadata`` will contain a ``page_name`` field, which corresponds
+to the sheet name in the Excel document.
+
+
+Microsoft Word Documents
+-------------------------
+
+Headers and footers in Word documents include a ``header_footer_type`` indicating which page
+a header or footer applies to. Valid values are ``"primary"``, ``"even_only"``, and ``"first_page"``.
+
+
+Webpages
+---------
+
+Elements from webpages will include a ``url`` metadata field, corresponding to the URL for the webpage.
+
+
+
+##########################
+Advanced Metadata Options
+##########################
+
+
+
+Extract Metadata with Regexes
+------------------------------
+
+``unstructured`` allows users to extract additional metadata with regexes using the ``regex_metadata`` kwarg.
+Here is an example of how to extract regex metadata:
+
+
+.. code:: python
+
+  from unstructured.partition.text import partition_text
+
+  text = "SPEAKER 1: It is my turn to speak now!"
+  elements = partition_text(text=text, regex_metadata={"speaker": r"SPEAKER \d{1,3}:"})
+  elements[0].metadata.regex_metadata
+
+The result will look like:
+
+
+.. code:: python
+
+  {'speaker':
+    [
+      {
+        'text': 'SPEAKER 1:',
+        'start': 0,
+        'end': 10,
+      }
+    ]
+  }
diff --git a/test_unstructured/partition/test_email.py b/test_unstructured/partition/test_email.py
@@ -206,15 +206,18 @@ def test_partition_email_has_metadata():
     filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-header.eml")
     elements = partition_email(filename=filename)
     assert len(elements) > 0
-    assert elements[0].metadata == ElementMetadata(
-        filename=filename,
-        date="2022-12-16T17:04:16-05:00",
-        page_number=None,
-        url=None,
-        sent_from=["Matthew Robinson <[email protected]>"],
-        sent_to=["Matthew Robinson <[email protected]>"],
-        subject="Test Email",
-        filetype="message/rfc822",
+    assert (
+        elements[0].metadata.to_dict()
+        == ElementMetadata(
+            filename=filename,
+            date="2022-12-16T17:04:16-05:00",
+            page_number=None,
+            url=None,
+            sent_from=["Matthew Robinson <[email protected]>"],
+            sent_to=["Matthew Robinson <[email protected]>"],
+            subject="Test Email",
+            filetype="message/rfc822",
+        ).to_dict()
     )
 
     expected_dt = datetime.datetime.fromisoformat("2022-12-16T17:04:16-05:00")

diff --git a/test_unstructured/partition/test_msg.py b/test_unstructured/partition/test_msg.py
@@ -36,15 +36,18 @@ def test_partition_msg_from_filename():
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.msg")
     elements = partition_msg(filename=filename)
     assert elements == EXPECTED_MSG_OUTPUT
-    assert elements[0].metadata == ElementMetadata(
-        filename=filename,
-        date="2022-12-16T17:04:16-05:00",
-        page_number=None,
-        url=None,
-        sent_from=["Matthew Robinson <[email protected]>"],
-        sent_to=["Matthew Robinson (None)"],
-        subject="Test Email",
-        filetype="application/vnd.ms-outlook",
+    assert (
+        elements[0].metadata.to_dict()
+        == ElementMetadata(
+            filename=filename,
+            date="2022-12-16T17:04:16-05:00",
+            page_number=None,
+            url=None,
+            sent_from=["Matthew Robinson <[email protected]>"],
+            sent_to=["Matthew Robinson (None)"],
+            subject="Test Email",
+            filetype="application/vnd.ms-outlook",
+        ).to_dict()
     )
 
 

diff --git a/test_unstructured/partition/test_text.py b/test_unstructured/partition/test_text.py
@@ -145,3 +145,12 @@ def test_partition_text_groups_broken_paragraphs():
         NarrativeText(text="The big brown fox was walking down the lane."),
         NarrativeText(text="At the end of the lane, the fox met a bear."),
     ]
+
+
+def test_partition_text_extract_regex_metadata():
+    text = "SPEAKER 1: It is my turn to speak now!"
+
+    elements = partition_text(text=text, regex_metadata={"speaker": r"SPEAKER \d{1,3}"})
+    assert elements[0].metadata.regex_metadata == {
+        "speaker": [{"text": "SPEAKER 1", "start": 0, "end": 9}],
+    }
diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py
@@ -2,11 +2,14 @@
 
 import datetime
 import hashlib
+import inspect
 import os
 import pathlib
+import re
 from abc import ABC
 from dataclasses import dataclass
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
+from functools import wraps
+from typing import Any, Callable, Dict, List, Optional, Tuple, TypedDict, Union, cast
 
 
 class NoID(ABC):
@@ -30,6 +33,14 @@ def to_dict(self):
         return {key: value for key, value in self.__dict__.items() if value is not None}
 
 
+class RegexMetadata(TypedDict):
+    """Metadata that is extracted from a document element via regex."""
+
+    text: str  # the matched substring of the element's text
+    start: int  # start offset of the match within the element's text
+    end: int  # end offset of the match (exclusive, as returned by ``Match.span()``)
+
+
 @dataclass
 class ElementMetadata:
     data_source: Optional[DataSourceMetadata] = None
@@ -58,6 +69,9 @@ class ElementMetadata:
     # Text format metadata fields
     text_as_html: Optional[str] = None
 
+    # Metadata extracted via regex
+    regex_metadata: Optional[Dict[str, List[RegexMetadata]]] = None
+
     def __post_init__(self):
         if isinstance(self.filename, pathlib.Path):
             self.filename = str(self.filename)
@@ -68,10 +82,12 @@ def __post_init__(self):
             self.filename = filename
 
     def to_dict(self):
-        dict = {key: value for key, value in self.__dict__.items() if value is not None}
+        _dict = {key: value for key, value in self.__dict__.items() if value is not None}
+        if "regex_metadata" in _dict and not _dict["regex_metadata"]:
+            _dict.pop("regex_metadata")
         if self.data_source:
-            dict["data_source"] = cast(DataSourceMetadata, self.data_source).to_dict()
-        return dict
+            _dict["data_source"] = cast(DataSourceMetadata, self.data_source).to_dict()
+        return _dict
 
     @classmethod
     def from_dict(cls, input_dict):
@@ -91,6 +107,58 @@ def get_date(self) -> Optional[datetime.datetime]:
         return dt
 
 
+def process_metadata():
+    """Decorator for processing metadata for document elements."""
+
+    def decorator(func: Callable):
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            elements = func(*args, **kwargs)
+            sig = inspect.signature(func)
+            params = dict(**dict(zip(sig.parameters, args)), **kwargs)
+            for param in sig.parameters.values():
+                if param.name not in params and param.default is not param.empty:
+                    params[param.name] = param.default
+
+            regex_metadata: Dict["str", "str"] = params.get("regex_metadata", {})
+            elements = _add_regex_metadata(elements, regex_metadata)
+
+            return elements
+
+        return wrapper
+
+    return decorator
+
+
+def _add_regex_metadata(
+    elements: List[Element],
+    regex_metadata: Dict[str, str] = {},
+) -> List[Element]:
+    """Adds metadata based on a user provided regular expression.
+    The additional metadata will be added to the regex_metadata
+    attrbuted in the element metadata."""
+    for element in elements:
+        if isinstance(element, Text):
+            _regex_metadata: Dict["str", List[RegexMetadata]] = {}
+            for field_name, pattern in regex_metadata.items():
+                results: List[RegexMetadata] = []
+                for result in re.finditer(pattern, element.text):
+                    start, end = result.span()
+                    results.append(
+                        {
+                            "text": element.text[start:end],
+                            "start": start,
+                            "end": end,
+                        },
+                    )
+                if len(results) > 0:
+                    _regex_metadata[field_name] = results
+
+            element.metadata.regex_metadata = _regex_metadata
+
+    return elements
+
+
 class Element(ABC):
     """An element is a section of a page in the document."""
 

diff --git a/unstructured/partition/csv.py b/unstructured/partition/csv.py
@@ -4,17 +4,24 @@
 import lxml.html
 import pandas as pd
 
-from unstructured.documents.elements import Element, ElementMetadata, Table
+from unstructured.documents.elements import (
+    Element,
+    ElementMetadata,
+    Table,
+    process_metadata,
+)
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.common import exactly_one, spooled_to_bytes_io_if_needed
 
 
+@process_metadata()
 @add_metadata_with_filetype(FileType.CSV)
 def partition_csv(
     filename: Optional[str] = None,
     file: Optional[Union[IO, SpooledTemporaryFile]] = None,
     metadata_filename: Optional[str] = None,
     include_metadata: bool = True,
+    **kwargs,
 ) -> List[Element]:
     """Partitions Microsoft Excel Documents in .csv format into its document elements.
 

diff --git a/unstructured/partition/doc.py b/unstructured/partition/doc.py
@@ -2,17 +2,19 @@
 import tempfile
 from typing import IO, List, Optional
 
-from unstructured.documents.elements import Element
+from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.common import convert_office_doc, exactly_one
 from unstructured.partition.docx import partition_docx
 
 
+@process_metadata()
 @add_metadata_with_filetype(FileType.DOC)
 def partition_doc(
     filename: Optional[str] = None,
     file: Optional[IO] = None,
     include_page_breaks: bool = True,
+    **kwargs,
 ) -> List[Element]:
     """Partitions Microsoft Word Documents in .doc format into its document elements.
 

diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py
@@ -22,6 +22,7 @@
     Table,
     Text,
     Title,
+    process_metadata,
 )
 from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
 from unstructured.partition.common import (
@@ -102,12 +103,14 @@ def _get_runs(node, parent):
 Paragraph.runs = property(lambda self: _get_paragraph_runs(self))
 
 
+@process_metadata()
 @add_metadata_with_filetype(FileType.DOCX)
 def partition_docx(
     filename: Optional[str] = None,
     file: Optional[Union[IO, SpooledTemporaryFile]] = None,
     metadata_filename: Optional[str] = None,
     include_page_breaks: bool = True,
+    **kwargs,
 ) -> List[Element]:
     """Partitions Microsoft Word Documents in .docx format into its document elements.
 

diff --git a/unstructured/partition/email.py b/unstructured/partition/email.py
@@ -29,6 +29,7 @@
     NarrativeText,
     Text,
     Title,
+    process_metadata,
 )
 from unstructured.documents.email_elements import (
     MetaData,
@@ -182,6 +183,7 @@ def find_embedded_image(
     return Image(text=image_info[:-1]), element
 
 
+@process_metadata()
 @add_metadata_with_filetype(FileType.EML)
 def partition_email(
     filename: Optional[str] = None,
@@ -190,6 +192,7 @@ def partition_email(
     content_source: str = "text/html",
     encoding: Optional[str] = None,
     include_headers: bool = False,
+    **kwargs,
 ) -> List[Element]:
     """Partitions an .eml documents into its constituent elements.
     Parameters