diff --git a/test_unstructured/documents/test_elements.py b/test_unstructured/documents/test_elements.py
index c85f6c8495..af97aa86be 100644
--- a/test_unstructured/documents/test_elements.py
+++ b/test_unstructured/documents/test_elements.py
@@ -16,6 +16,7 @@
     Element,
     ElementMetadata,
     NoID,
+    RegexMetadata,
     Text,
 )
 
@@ -189,6 +190,24 @@ def test_element_to_dict():
     assert element.to_dict() == expected
 
 
+def test_regex_metadata_round_trips_through_JSON():
+    """metadata.regex_metadata should appear at full depth in JSON."""
+    regex_metadata = {
+        "mail-stop": [RegexMetadata(text="MS-107", start=18, end=24)],
+        "version": [
+            RegexMetadata(text="current=v1.7.2", start=7, end=21),
+            RegexMetadata(text="supersedes=v1.7.2", start=22, end=40),
+        ],
+    }
+    metadata = ElementMetadata(regex_metadata=regex_metadata)
+
+    metadata_json = json.dumps(metadata.to_dict())
+    deserialized_metadata = ElementMetadata.from_dict(json.loads(metadata_json))
+    reserialized_metadata_json = json.dumps(deserialized_metadata.to_dict())
+
+    assert reserialized_metadata_json == metadata_json
+
+
 def test_metadata_from_dict_extra_fields():
     """
     Assert that the metadata classes ignore nonexistent fields.
diff --git a/unstructured/chunking/title.py b/unstructured/chunking/title.py
index b27d0adbe2..fbcb8c56e5 100644
--- a/unstructured/chunking/title.py
+++ b/unstructured/chunking/title.py
@@ -1,4 +1,4 @@
-"""Implementation of chunking.
+"""Implementation of chunking by title.
 
 Main entry point is the `@add_chunking_strategy()` decorator.
 """