diff --git a/CHANGELOG.md b/CHANGELOG.md index 6044d71244..828ef9c335 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ * **Add `.metadata.is_continuation` to text-split chunks.** `.metadata.is_continuation=True` is added to second-and-later chunks formed by text-splitting an oversized `Table` element but not to their counterpart `Text` element splits. Add this indicator for `CompositeElement` to allow text-split continuation chunks to be identified for downstream processes that may wish to skip intentionally redundant metadata values in continuation chunks. * **Add `compound_structure_acc` metric to table eval.** Add a new property to `unstructured.metrics.table_eval.TableEvaluation`: `composite_structure_acc`, which is computed from the element level row and column index and content accuracy scores +* **Add `.metadata.orig_elements` to chunks.** `.metadata.orig_elements: list[Element]` is added to chunks during the chunking process (when requested) to allow access to information from the elements each chunk was formed from. This is useful for example to recover metadata fields that cannot be consolidated to a single value for a chunk, like `page_number`, `coordinates`, and `image_base64`. ### Features diff --git a/docs/source/apis/api_parameters.rst b/docs/source/apis/api_parameters.rst index 7e401b8877..fb8fbb7c58 100644 --- a/docs/source/apis/api_parameters.rst +++ b/docs/source/apis/api_parameters.rst @@ -39,6 +39,12 @@ encoding - **Description**: The encoding method used to decode the text input. Default: utf-8. - **Example**: utf-8 +extract_image_block_types +------------------------- +- **Type**: array +- **Description**: The types of image blocks to extract from the document. Supports various Element types. 
+- **Example**: ['Image', 'Table'] + hi_res_model_name ----------------- - **Type**: string @@ -48,7 +54,8 @@ hi_res_model_name include_page_breaks ------------------- - **Type**: boolean -- **Description**: If True, the output will include page breaks if the filetype supports it. Default: false. +- **Description**: When true, the output will include page break elements when the filetype supports + it. Default: false. languages --------- @@ -72,37 +79,66 @@ xml_keep_tags - **Type**: boolean - **Description**: If True, will retain the XML tags in the output. Otherwise it will simply extract the text from within the tags. Only applies to partition_xml. + +Chunking Parameters +------------------- + +The following parameters control chunking behavior. Chunking is automatically performed after +partitioning when a value is provided for the ``chunking_strategy`` argument. The remaining chunking +parameters are only operative when a chunking strategy is specified. Note that not all chunking +parameters apply to all chunking strategies. Any chunking arguments not supported by the selected +chunker are ignored. + chunking_strategy ----------------- - **Type**: string -- **Description**: Use one of the supported strategies to chunk the returned elements. Currently supports: by_title. -- **Example**: by_title - -multipage_sections ------------------- -- **Type**: boolean -- **Description**: If chunking strategy is set, determines if sections can span multiple sections. Default: true. +- **Description**: Use one of the supported strategies to chunk the returned elements. When omitted, + no chunking is performed and any other chunking parameters provided are ignored. +- **Valid values**: ``"basic"``, ``"by_title"`` combine_under_n_chars --------------------- - **Type**: integer -- **Description**: If chunking strategy is set, combine elements until a section reaches a length of n chars. Default: 500. 
+- **Applicable Chunkers**: "by_title" only +- **Description**: When chunking strategy is set to "by_title", combine small chunks until the + combined chunk reaches a length of n chars. This can mitigate the appearance of small chunks + created by short paragraphs, not intended as section headings, being identified as ``Title`` + elements in certain documents. +- **Default**: the same value as ``max_characters`` - **Example**: 500 -new_after_n_chars ----------------- -- **Type**: integer -- **Description**: If chunking strategy is set, cut off new sections after reaching a length of n chars (soft max). Default: 1500. -- **Example**: 1500 +include_orig_elements +--------------------- +- **Type**: boolean +- **Applicable Chunkers**: All +- **Description**: Add the elements used to form each chunk to ``.metadata.orig_elements`` for that + chunk. These can be used to recover the original text and metadata for individual elements when + that is required, for example to identify the page-numbers or coordinates spanned by a chunk. + When an element larger than ``max_characters`` is divided into two or more chunks via + text-splitting, each of those chunks will contain the entire original element as the only item in + its ``.metadata.orig_elements`` list. +- **Default**: true max_characters -------------- - **Type**: integer -- **Description**: If chunking strategy is set, cut off new sections after reaching a length of n chars (hard max). Default: 1500. -- **Example**: 1500 +- **Applicable Chunkers**: All +- **Description**: When chunking strategy is set, cut off new chunks after reaching a length of n + chars (hard max). +- **Default**: 500 -extract_image_block_types -------------------------- -- **Type**: array -- **Description**: The types of image blocks to extract from the document. Supports various Element types. 
-- **Example**: ['Image', 'Table'] +multipage_sections +------------------ +- **Type**: boolean +- **Applicable Chunkers**: "by_title" only +- **Description**: When true and chunking strategy is set to "by_title", allows a chunk to include + elements from more than one page. Otherwise chunks are broken on page boundaries. +- **Default**: true + +new_after_n_chars +----------------- +- **Type**: integer +- **Applicable Chunkers**: "basic", "by_title" +- **Description**: When chunking strategy is set, cut off new chunks after reaching a length of n + chars (soft max). +- **Default**: 1500 diff --git a/docs/source/core/chunking.rst b/docs/source/core/chunking.rst index 6196ffce72..ee23620d59 100644 --- a/docs/source/core/chunking.rst +++ b/docs/source/core/chunking.rst @@ -65,8 +65,8 @@ be specified when a non-default setting is required. Specific chunking strategie need to decide based on your use-case whether this option is right for you. -Chunking elements ------------------ +Chunking +-------- Chunking can be performed as part of partitioning or as a separate step after partitioning: @@ -170,3 +170,45 @@ following behaviors: ``combine_text_under_n_chars`` argument. This defaults to the same value as ``max_characters`` such that sequential small sections are combined to maximally fill the chunking window. Setting this to ``0`` will disable section combining. + + +Recovering Chunk Elements +------------------------- + +In general, a chunk consolidates multiple document elements to maximally fill a chunk of the desired +size. Information is naturally lost in this consolidation, for example which element a portion of +the text came from and certain metadata like page-number and coordinates which cannot always be +resolved to a single value. + +The original elements combined to make a chunk can be accessed using the ``.metadata.orig_elements`` +field on the chunk: + +.. code:: python + + >>> elements = [ + ... Title("Lorem Ipsum"), + ... 
NarrativeText("Lorem ipsum dolor sit."), + ... ] + >>> chunk = chunk_elements(elements)[0] + >>> print(chunk.text) + 'Lorem Ipsum\n\nLorem ipsum dolor sit.' + >>> print(chunk.metadata.orig_elements) + [Title("Lorem Ipsum"), NarrativeText("Lorem ipsum dolor sit.")] + +These elements will contain all their original metadata so can be used to access metadata that +cannot reliably be consolidated, for example: + +.. code:: python + + >>> {e.metadata.page_number for e in chunk.metadata.orig_elements} + {2, 3} + + >>> [e.metadata.coordinates for e in chunk.metadata.orig_elements] + [<CoordinatesMetadata>, <CoordinatesMetadata>, ...] + + >>> [ + ... e.metadata.image_path + ... for e in chunk.metadata.orig_elements + ... if e.metadata.image_path is not None + ... ] + ['/tmp/lorem.jpg', '/tmp/ipsum.png'] diff --git a/docs/source/metadata.rst b/docs/source/metadata.rst index 2dcfd3b547..4a536910c0 100644 --- a/docs/source/metadata.rst +++ b/docs/source/metadata.rst @@ -7,9 +7,9 @@ Metadata ======== The ``unstructured`` package tracks a variety of metadata about Elements extracted from documents. -Tracking metadata enables users to filter document elements downstream based on element metadata of interest. -For example, a user may be interested in selected document elements from a given page number -or an e-mail with a given subject line. +Element metadata has a variety of uses including: +* filtering document elements based on an element metadata value, for example, elements from a given page number or an e-mail with a subject matching a regular expression. +* mapping an element to the document page where it occurred so that original page can be retrieved when that element matches search criteria. Metadata is tracked at the element level. You can extract the metadata for a given document element with ``element.metadata``. For a dictionary representation, use ``element.metadata.to_dict()``. @@ -136,34 +136,34 @@ returned. 
If the ``in_place`` flag is ``False``, only the altered coordinates ar Additional Metadata Fields by Document Type ########################################### -+-------------------------+---------------------+--------------------------------------------------------+ -| Field Name | Applicable Doc Types| Short Description | -+=========================+=====================+========================================================+ -| page_number | DOCX,PDF, PPT,XLSX | Page Number | -+-------------------------+---------------------+--------------------------------------------------------+ -| page_name | XLSX | Sheet Name in Excel document | -+-------------------------+---------------------+--------------------------------------------------------+ -| sent_from | EML | Email Sender | -+-------------------------+---------------------+--------------------------------------------------------+ -| sent_to | EML | Email Recipient | -+-------------------------+---------------------+--------------------------------------------------------+ -| subject | EML | Email Subject | -+-------------------------+---------------------+--------------------------------------------------------+ -| attached_to_filename | MSG | filename that attachment file is attached to | -+-------------------------+---------------------+--------------------------------------------------------+ -| header_footer_type | Word Doc | Pages a header or footer applies to: "primary", | -| | | "even_only", and "first_page" | -+-------------------------+---------------------+--------------------------------------------------------+ -| link_urls | HTML | The url associated with a link in a document. | -+-------------------------+---------------------+--------------------------------------------------------+ -| link_texts | HTML | The text associated with a link in a document. 
| -+-------------------------+---------------------+--------------------------------------------------------+ -| links | HTML | List of {”text”: “, “url”: } items. | -| | | Note: this element will be removed in the near future | -| | | in favor of the above link_urls and link_texts. | -+-------------------------+---------------------+--------------------------------------------------------+ -| section | EPUB | Book section title corresponding to table of contents | -+-------------------------+---------------------+--------------------------------------------------------+ ++-------------------------+-----------------------+--------------------------------------------------------+ +| Field Name | Applicable Doc Types | Short Description | ++=========================+=======================+========================================================+ +| page_number | DOCX, PDF, PPT, XLSX | Page Number | ++-------------------------+-----------------------+--------------------------------------------------------+ +| page_name | XLSX | Sheet Name in Excel document | ++-------------------------+-----------------------+--------------------------------------------------------+ +| sent_from | EML | Email Sender | ++-------------------------+-----------------------+--------------------------------------------------------+ +| sent_to | EML | Email Recipient | ++-------------------------+-----------------------+--------------------------------------------------------+ +| subject | EML | Email Subject | ++-------------------------+-----------------------+--------------------------------------------------------+ +| attached_to_filename | MSG | filename that attachment file is attached to | ++-------------------------+-----------------------+--------------------------------------------------------+ +| header_footer_type | Word Doc | Pages a header or footer applies to: "primary", | +| | | "even_only", and "first_page" | 
++-------------------------+-----------------------+--------------------------------------------------------+ +| link_urls | HTML | The url associated with a link in a document. | ++-------------------------+-----------------------+--------------------------------------------------------+ +| link_texts | HTML | The text associated with a link in a document. | ++-------------------------+-----------------------+--------------------------------------------------------+ +| links | HTML | List of {”text”: “, “url”: } items. | +| | | Note: this element will be removed in the near future | +| | | in favor of the above link_urls and link_texts. | ++-------------------------+-----------------------+--------------------------------------------------------+ +| section | EPUB | Book section title corresponding to table of contents | ++-------------------------+-----------------------+--------------------------------------------------------+ :raw-html:`
` Notes on additional metadata by document type: diff --git a/test_unstructured/documents/test_elements.py b/test_unstructured/documents/test_elements.py index 3bdfb2eccf..61de0f54d4 100644 --- a/test_unstructured/documents/test_elements.py +++ b/test_unstructured/documents/test_elements.py @@ -27,6 +27,7 @@ Points, RegexMetadata, Text, + Title, ) @@ -381,6 +382,22 @@ def and_it_serializes_a_data_source_sub_object_to_a_dict_when_it_is_present(self "page_number": 2, } + def and_it_serializes_an_orig_elements_sub_object_to_base64_when_it_is_present(self): + meta = ElementMetadata( + category_depth=1, + orig_elements=[Title("Lorem"), Text("Lorem Ipsum")], + page_number=2, + ) + assert meta.to_dict() == { + "category_depth": 1, + "orig_elements": ( + "eJyFzcsKwjAQheFXKVm7yDS3xjcQXNaViKTJjBR6o46glr67zVI3Lmf4Dv95EdhhjwNf2yT2hYDGUaWt" + "JVm5WDoqNUL0UoJrqtLHJHaF6JFDChw2v6zbzfjkvD2OM/YZ8GvC/Khb7lBs5LcilUwRyCsblQYTiBQp" + "ZRxYZcCA/1spDtP98dU6DTEw3sa5fWOqs10vH0cLQn0=" + ), + "page_number": 2, + } + def but_unlike_in_ElementMetadata_unknown_fields_in_sub_objects_are_ignored(self): """Metadata sub-objects ignore fields they do not explicitly define. 
diff --git a/test_unstructured/staging/test_base.py b/test_unstructured/staging/test_base.py index 466eeb44bb..8c977a24d3 100644 --- a/test_unstructured/staging/test_base.py +++ b/test_unstructured/staging/test_base.py @@ -31,6 +31,28 @@ from unstructured.staging import base +def test_base64_gzipped_json_to_elements_can_deserialize_compressed_elements_from_a_JSON_string(): + base64_elements_str = ( + "eJyFzcsKwjAQheFXKVm7yDS3xjcQXNaViKTJjBR6o46glr67zVI3Lmf4Dv95EdhhjwNf2yT2hYDGUaWtJVm5WDoq" + "NUL0UoJrqtLHJHaF6JFDChw2v6zbzfjkvD2OM/YZ8GvC/Khb7lBs5LcilUwRyCsblQYTiBQpZRxYZcCA/1spDtP9" + "8dU6DTEw3sa5fWOqs10vH0cLQn0=" + ) + + elements = base.elements_from_base64_gzipped_json(base64_elements_str) + + assert elements == [Title("Lorem"), Text("Lorem Ipsum")] + + +def test_elements_to_base64_gzipped_json_can_serialize_elements_to_a_base64_str(): + elements = [Title("Lorem"), Text("Lorem Ipsum")] + + assert base.elements_to_base64_gzipped_json(elements) == ( + "eJyFzcsKwjAQheFXKVm7yDS3xjcQXNaViKTJjBR6o46glr67zVI3Lmf4Dv95EdhhjwNf2yT2hYDGUaWtJVm5WDoq" + "NUL0UoJrqtLHJHaF6JFDChw2v6zbzfjkvD2OM/YZ8GvC/Khb7lBs5LcilUwRyCsblQYTiBQpZRxYZcCA/1spDtP9" + "8dU6DTEw3sa5fWOqs10vH0cLQn0=" + ) + + def test_elements_to_dicts(): elements = [Title(text="Title 1"), NarrativeText(text="Narrative 1")] isd = base.elements_to_dicts(elements) diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-basic-chunking/handbook-1p.docx.json b/test_unstructured_ingest/expected-structured-output/local-single-file-basic-chunking/handbook-1p.docx.json index 9a1ea65d24..275ada2050 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-basic-chunking/handbook-1p.docx.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-basic-chunking/handbook-1p.docx.json @@ -24,7 +24,8 @@ "languages": [ "eng" ], - "page_number": 1 + "page_number": 1, + "orig_elements": 
"eJztlFFv2yAQx7+KxfNiY5Ng07eqq5S+NFGaPLWRheFIrNpgYbyli/rdB2mjpVMeNm176FTJkn3H/+DP/XS+3yNooAXtylqiiwiNc5ZKyYqMMiUqjMdcSkpA5TkRtKA5+hShFhyX3HGv36PwUfZmsAIOcQe2rfu+NrovX0X3e9QaGZYJSYviee33GGwTjku2poXEDlqDTb4a+5gMund2EG6wIN8GDnpXnmbKWm98LonjBHa87RoYSSP6ZMu1rIx5HKVd7BM79OwPVHUD7qkLLhDvuqYW3HmTyRctY9OB3rWNMrblrh8ZpWoBvnIIjYm9LdlZI8DfSm/aJj6uhF5sgUuwpTLG+dfxgM7WLbdPQdBwvRm49xkagUBv0DrYcbBzQbm6i5Z26B1ANH21HaqOG00P24cL/AyqAMkwZePJBGNGCo5pmjFBcE6JIhjDuwUFbbflff3Nl4UmlcJo56/90r+r6eV8eb2IUrQ+I3V88yKrDsv/CPkZoj7b+Uyph7byuC6i9JTxD9MnZJe1a+AcWDIu/IMJzWiuOB5TqSibUEkrpWgm2P8J9uZ2uZh9Xl0tb2a374ntG9+/gpczVagsy4DwrEi5YtyDpgVRRYjSjx/sX5u26GHIcEqi30YE+bjCqpI5VTRnueAym1QgcOoH0I9h9oHoDxFdxg9uvlrMZ3fXZ4CsvwOuEu/w" } }, { @@ -44,7 +45,8 @@ "languages": [ "eng" ], - "page_number": 1 + "page_number": 1, + "orig_elements": "eJx1VF1v2zoM/SuEn1IgdZombbJ7326B4T4Nw9o9bUNAS3QsVJYMiWoaFP3vI51k65YNCGKZ4sfh4aG/vFTkqafAG2erf6BaLolWy2VLdnm1vGne3Rhavru+Xa1xZVfrK6ymUPXEaJFR/F8qPWxyLMnQ+D5Q6l3OLoa8OTp9ean6aPV6sZiv16/fJEdJXsvNutjTLJUQKM12MT3OSsiciuGSyP76wpR589aycWErtlldz+gZ+8HTpY0mzzoMtonx8XI+1GJ4rl6lYOs88X5QFBUOg3cGWUDOnoKt40DhufdtTD1yvoxt6wxJZFFiaoFlhxQNSVdh2/v6dKNceAzbggJD+6wobCvtbhDLJpS+oST2udZnemat/dARfA6OycI9o/QED6lkJgJBFV3gDAIfchEin1yW+8xikMrAB8eDQx8lSUxn3gb1vwRLCUyHA8tzvoDYAjv2BPP5eD5DcScTqgGu1/C5vq/vavharq5wBTfr20lzITcK/P8js1OQERcJxgwDJtaUIgGwhd2P6n9KNQUZGDbe5U78okD0mFyrQQppiNnpVP4M8UTUxxS3CXuYHA8XIBHqfiwfdxLT7AHPqAOOB0dqlLspGJGROxyjXKSxG83hgvyEO4E7HSn+K5zfqAEb5S5EhkGCRSUCoy3ej0lMVJUyKTAe1+7Uqcfdv+AYcheLt2N8Q1DygWPppDRZBlgkVmQKnrboQQtgMt2YGwP6fXb5iOes9V4emrPF3nmHCXaOO0nh6QmDoo0iIF3bE6T/MDymMrDZj+KYwnuSsUrZT8XT6PXG46MuiJWthMl4faGs7cFHIwHNT7+klwdGVanatwAWUf4qlcX1fPoXAYn1rn5ff3pjrW8neDFZqErvz5dFMFEwIk9ZSqsKaKP3cSeQ0bAsOjy4QQZuZRqqRKuz587lnyNtCisjAleT6WyM8CQRoh7oxQtFQvtavwenT8wHTEk+ME/0oHv/+u07sfvfaA==" } }, { @@ -65,6 +67,7 @@ "eng" ], "page_number": 1, + "orig_elements": 
"eJx1VF1v2zoM/SuEn1IgdZombbJ7326B4T4Nw9o9bUNAS3QsVJYMiWoaFP3vI51k65YNCGKZ4sfh4aG/vFTkqafAG2erf6BaLolWy2VLdnm1vGne3Rhavru+Xa1xZVfrK6ymUPXEaJFR/F8qPWxyLMnQ+D5Q6l3OLoa8OTp9ean6aPV6sZiv16/fJEdJXsvNutjTLJUQKM12MT3OSsiciuGSyP76wpR589aycWErtlldz+gZ+8HTpY0mzzoMtonx8XI+1GJ4rl6lYOs88X5QFBUOg3cGWUDOnoKt40DhufdtTD1yvoxt6wxJZFFiaoFlhxQNSVdh2/v6dKNceAzbggJD+6wobCvtbhDLJpS+oST2udZnemat/dARfA6OycI9o/QED6lkJgJBFV3gDAIfchEin1yW+8xikMrAB8eDQx8lSUxn3gb1vwRLCUyHA8tzvoDYAjv2BPP5eD5DcScTqgGu1/C5vq/vavharq5wBTfr20lzITcK/P8js1OQERcJxgwDJtaUIgGwhd2P6n9KNQUZGDbe5U78okD0mFyrQQppiNnpVP4M8UTUxxS3CXuYHA8XIBHqfiwfdxLT7AHPqAOOB0dqlLspGJGROxyjXKSxG83hgvyEO4E7HSn+K5zfqAEb5S5EhkGCRSUCoy3ej0lMVJUyKTAe1+7Uqcfdv+AYcheLt2N8Q1DygWPppDRZBlgkVmQKnrboQQtgMt2YGwP6fXb5iOes9V4emrPF3nmHCXaOO0nh6QmDoo0iIF3bE6T/MDymMrDZj+KYwnuSsUrZT8XT6PXG46MuiJWthMl4faGs7cFHIwHNT7+klwdGVanatwAWUf4qlcX1fPoXAYn1rn5ff3pjrW8neDFZqErvz5dFMFEwIk9ZSqsKaKP3cSeQ0bAsOjy4QQZuZRqqRKuz587lnyNtCisjAleT6WyM8CQRoh7oxQtFQvtavwenT8wHTEk+ME/0oHv/+u07sfvfaA==", "is_continuation": true } }, @@ -86,6 +89,7 @@ "eng" ], "page_number": 1, + "orig_elements": 
"eJx1VF1v2zoM/SuEn1IgdZombbJ7326B4T4Nw9o9bUNAS3QsVJYMiWoaFP3vI51k65YNCGKZ4sfh4aG/vFTkqafAG2erf6BaLolWy2VLdnm1vGne3Rhavru+Xa1xZVfrK6ymUPXEaJFR/F8qPWxyLMnQ+D5Q6l3OLoa8OTp9ean6aPV6sZiv16/fJEdJXsvNutjTLJUQKM12MT3OSsiciuGSyP76wpR589aycWErtlldz+gZ+8HTpY0mzzoMtonx8XI+1GJ4rl6lYOs88X5QFBUOg3cGWUDOnoKt40DhufdtTD1yvoxt6wxJZFFiaoFlhxQNSVdh2/v6dKNceAzbggJD+6wobCvtbhDLJpS+oST2udZnemat/dARfA6OycI9o/QED6lkJgJBFV3gDAIfchEin1yW+8xikMrAB8eDQx8lSUxn3gb1vwRLCUyHA8tzvoDYAjv2BPP5eD5DcScTqgGu1/C5vq/vavharq5wBTfr20lzITcK/P8js1OQERcJxgwDJtaUIgGwhd2P6n9KNQUZGDbe5U78okD0mFyrQQppiNnpVP4M8UTUxxS3CXuYHA8XIBHqfiwfdxLT7AHPqAOOB0dqlLspGJGROxyjXKSxG83hgvyEO4E7HSn+K5zfqAEb5S5EhkGCRSUCoy3ej0lMVJUyKTAe1+7Uqcfdv+AYcheLt2N8Q1DygWPppDRZBlgkVmQKnrboQQtgMt2YGwP6fXb5iOes9V4emrPF3nmHCXaOO0nh6QmDoo0iIF3bE6T/MDymMrDZj+KYwnuSsUrZT8XT6PXG46MuiJWthMl4faGs7cFHIwHNT7+klwdGVanatwAWUf4qlcX1fPoXAYn1rn5ff3pjrW8neDFZqErvz5dFMFEwIk9ZSqsKaKP3cSeQ0bAsOjy4QQZuZRqqRKuz587lnyNtCisjAleT6WyM8CQRoh7oxQtFQvtavwenT8wHTEk+ME/0oHv/+u07sfvfaA==", "is_continuation": true } }, @@ -107,6 +111,7 @@ "eng" ], "page_number": 1, + "orig_elements": 
"eJx1VF1v2zoM/SuEn1IgdZombbJ7326B4T4Nw9o9bUNAS3QsVJYMiWoaFP3vI51k65YNCGKZ4sfh4aG/vFTkqafAG2erf6BaLolWy2VLdnm1vGne3Rhavru+Xa1xZVfrK6ymUPXEaJFR/F8qPWxyLMnQ+D5Q6l3OLoa8OTp9ean6aPV6sZiv16/fJEdJXsvNutjTLJUQKM12MT3OSsiciuGSyP76wpR589aycWErtlldz+gZ+8HTpY0mzzoMtonx8XI+1GJ4rl6lYOs88X5QFBUOg3cGWUDOnoKt40DhufdtTD1yvoxt6wxJZFFiaoFlhxQNSVdh2/v6dKNceAzbggJD+6wobCvtbhDLJpS+oST2udZnemat/dARfA6OycI9o/QED6lkJgJBFV3gDAIfchEin1yW+8xikMrAB8eDQx8lSUxn3gb1vwRLCUyHA8tzvoDYAjv2BPP5eD5DcScTqgGu1/C5vq/vavharq5wBTfr20lzITcK/P8js1OQERcJxgwDJtaUIgGwhd2P6n9KNQUZGDbe5U78okD0mFyrQQppiNnpVP4M8UTUxxS3CXuYHA8XIBHqfiwfdxLT7AHPqAOOB0dqlLspGJGROxyjXKSxG83hgvyEO4E7HSn+K5zfqAEb5S5EhkGCRSUCoy3ej0lMVJUyKTAe1+7Uqcfdv+AYcheLt2N8Q1DygWPppDRZBlgkVmQKnrboQQtgMt2YGwP6fXb5iOes9V4emrPF3nmHCXaOO0nh6QmDoo0iIF3bE6T/MDymMrDZj+KYwnuSsUrZT8XT6PXG46MuiJWthMl4faGs7cFHIwHNT7+klwdGVanatwAWUf4qlcX1fPoXAYn1rn5ff3pjrW8neDFZqErvz5dFMFEwIk9ZSqsKaKP3cSeQ0bAsOjy4QQZuZRqqRKuz587lnyNtCisjAleT6WyM8CQRoh7oxQtFQvtavwenT8wHTEk+ME/0oHv/+u07sfvfaA==", "is_continuation": true } }, @@ -128,6 +133,7 @@ "eng" ], "page_number": 1, + "orig_elements": 
"eJx1VF1v2zoM/SuEn1IgdZombbJ7326B4T4Nw9o9bUNAS3QsVJYMiWoaFP3vI51k65YNCGKZ4sfh4aG/vFTkqafAG2erf6BaLolWy2VLdnm1vGne3Rhavru+Xa1xZVfrK6ymUPXEaJFR/F8qPWxyLMnQ+D5Q6l3OLoa8OTp9ean6aPV6sZiv16/fJEdJXsvNutjTLJUQKM12MT3OSsiciuGSyP76wpR589aycWErtlldz+gZ+8HTpY0mzzoMtonx8XI+1GJ4rl6lYOs88X5QFBUOg3cGWUDOnoKt40DhufdtTD1yvoxt6wxJZFFiaoFlhxQNSVdh2/v6dKNceAzbggJD+6wobCvtbhDLJpS+oST2udZnemat/dARfA6OycI9o/QED6lkJgJBFV3gDAIfchEin1yW+8xikMrAB8eDQx8lSUxn3gb1vwRLCUyHA8tzvoDYAjv2BPP5eD5DcScTqgGu1/C5vq/vavharq5wBTfr20lzITcK/P8js1OQERcJxgwDJtaUIgGwhd2P6n9KNQUZGDbe5U78okD0mFyrQQppiNnpVP4M8UTUxxS3CXuYHA8XIBHqfiwfdxLT7AHPqAOOB0dqlLspGJGROxyjXKSxG83hgvyEO4E7HSn+K5zfqAEb5S5EhkGCRSUCoy3ej0lMVJUyKTAe1+7Uqcfdv+AYcheLt2N8Q1DygWPppDRZBlgkVmQKnrboQQtgMt2YGwP6fXb5iOes9V4emrPF3nmHCXaOO0nh6QmDoo0iIF3bE6T/MDymMrDZj+KYwnuSsUrZT8XT6PXG46MuiJWthMl4faGs7cFHIwHNT7+klwdGVanatwAWUf4qlcX1fPoXAYn1rn5ff3pjrW8neDFZqErvz5dFMFEwIk9ZSqsKaKP3cSeQ0bAsOjy4QQZuZRqqRKuz587lnyNtCisjAleT6WyM8CQRoh7oxQtFQvtavwenT8wHTEk+ME/0oHv/+u07sfvfaA==", "is_continuation": true } }, @@ -149,6 +155,7 @@ "eng" ], "page_number": 1, + "orig_elements": 
"eJx1VF1v2zoM/SuEn1IgdZombbJ7326B4T4Nw9o9bUNAS3QsVJYMiWoaFP3vI51k65YNCGKZ4sfh4aG/vFTkqafAG2erf6BaLolWy2VLdnm1vGne3Rhavru+Xa1xZVfrK6ymUPXEaJFR/F8qPWxyLMnQ+D5Q6l3OLoa8OTp9ean6aPV6sZiv16/fJEdJXsvNutjTLJUQKM12MT3OSsiciuGSyP76wpR589aycWErtlldz+gZ+8HTpY0mzzoMtonx8XI+1GJ4rl6lYOs88X5QFBUOg3cGWUDOnoKt40DhufdtTD1yvoxt6wxJZFFiaoFlhxQNSVdh2/v6dKNceAzbggJD+6wobCvtbhDLJpS+oST2udZnemat/dARfA6OycI9o/QED6lkJgJBFV3gDAIfchEin1yW+8xikMrAB8eDQx8lSUxn3gb1vwRLCUyHA8tzvoDYAjv2BPP5eD5DcScTqgGu1/C5vq/vavharq5wBTfr20lzITcK/P8js1OQERcJxgwDJtaUIgGwhd2P6n9KNQUZGDbe5U78okD0mFyrQQppiNnpVP4M8UTUxxS3CXuYHA8XIBHqfiwfdxLT7AHPqAOOB0dqlLspGJGROxyjXKSxG83hgvyEO4E7HSn+K5zfqAEb5S5EhkGCRSUCoy3ej0lMVJUyKTAe1+7Uqcfdv+AYcheLt2N8Q1DygWPppDRZBlgkVmQKnrboQQtgMt2YGwP6fXb5iOes9V4emrPF3nmHCXaOO0nh6QmDoo0iIF3bE6T/MDymMrDZj+KYwnuSsUrZT8XT6PXG46MuiJWthMl4faGs7cFHIwHNT7+klwdGVanatwAWUf4qlcX1fPoXAYn1rn5ff3pjrW8neDFZqErvz5dFMFEwIk9ZSqsKaKP3cSeQ0bAsOjy4QQZuZRqqRKuz587lnyNtCisjAleT6WyM8CQRoh7oxQtFQvtavwenT8wHTEk+ME/0oHv/+u07sfvfaA==", "is_continuation": true } }, @@ -170,6 +177,7 @@ "eng" ], "page_number": 1, + "orig_elements": 
"eJx1VF1v2zoM/SuEn1IgdZombbJ7326B4T4Nw9o9bUNAS3QsVJYMiWoaFP3vI51k65YNCGKZ4sfh4aG/vFTkqafAG2erf6BaLolWy2VLdnm1vGne3Rhavru+Xa1xZVfrK6ymUPXEaJFR/F8qPWxyLMnQ+D5Q6l3OLoa8OTp9ean6aPV6sZiv16/fJEdJXsvNutjTLJUQKM12MT3OSsiciuGSyP76wpR589aycWErtlldz+gZ+8HTpY0mzzoMtonx8XI+1GJ4rl6lYOs88X5QFBUOg3cGWUDOnoKt40DhufdtTD1yvoxt6wxJZFFiaoFlhxQNSVdh2/v6dKNceAzbggJD+6wobCvtbhDLJpS+oST2udZnemat/dARfA6OycI9o/QED6lkJgJBFV3gDAIfchEin1yW+8xikMrAB8eDQx8lSUxn3gb1vwRLCUyHA8tzvoDYAjv2BPP5eD5DcScTqgGu1/C5vq/vavharq5wBTfr20lzITcK/P8js1OQERcJxgwDJtaUIgGwhd2P6n9KNQUZGDbe5U78okD0mFyrQQppiNnpVP4M8UTUxxS3CXuYHA8XIBHqfiwfdxLT7AHPqAOOB0dqlLspGJGROxyjXKSxG83hgvyEO4E7HSn+K5zfqAEb5S5EhkGCRSUCoy3ej0lMVJUyKTAe1+7Uqcfdv+AYcheLt2N8Q1DygWPppDRZBlgkVmQKnrboQQtgMt2YGwP6fXb5iOes9V4emrPF3nmHCXaOO0nh6QmDoo0iIF3bE6T/MDymMrDZj+KYwnuSsUrZT8XT6PXG46MuiJWthMl4faGs7cFHIwHNT7+klwdGVanatwAWUf4qlcX1fPoXAYn1rn5ff3pjrW8neDFZqErvz5dFMFEwIk9ZSqsKaKP3cSeQ0bAsOjy4QQZuZRqqRKuz587lnyNtCisjAleT6WyM8CQRoh7oxQtFQvtavwenT8wHTEk+ME/0oHv/+u07sfvfaA==", "is_continuation": true } }, @@ -191,6 +199,7 @@ "eng" ], "page_number": 1, + "orig_elements": 
"eJx1VF1v2zoM/SuEn1IgdZombbJ7326B4T4Nw9o9bUNAS3QsVJYMiWoaFP3vI51k65YNCGKZ4sfh4aG/vFTkqafAG2erf6BaLolWy2VLdnm1vGne3Rhavru+Xa1xZVfrK6ymUPXEaJFR/F8qPWxyLMnQ+D5Q6l3OLoa8OTp9ean6aPV6sZiv16/fJEdJXsvNutjTLJUQKM12MT3OSsiciuGSyP76wpR589aycWErtlldz+gZ+8HTpY0mzzoMtonx8XI+1GJ4rl6lYOs88X5QFBUOg3cGWUDOnoKt40DhufdtTD1yvoxt6wxJZFFiaoFlhxQNSVdh2/v6dKNceAzbggJD+6wobCvtbhDLJpS+oST2udZnemat/dARfA6OycI9o/QED6lkJgJBFV3gDAIfchEin1yW+8xikMrAB8eDQx8lSUxn3gb1vwRLCUyHA8tzvoDYAjv2BPP5eD5DcScTqgGu1/C5vq/vavharq5wBTfr20lzITcK/P8js1OQERcJxgwDJtaUIgGwhd2P6n9KNQUZGDbe5U78okD0mFyrQQppiNnpVP4M8UTUxxS3CXuYHA8XIBHqfiwfdxLT7AHPqAOOB0dqlLspGJGROxyjXKSxG83hgvyEO4E7HSn+K5zfqAEb5S5EhkGCRSUCoy3ej0lMVJUyKTAe1+7Uqcfdv+AYcheLt2N8Q1DygWPppDRZBlgkVmQKnrboQQtgMt2YGwP6fXb5iOes9V4emrPF3nmHCXaOO0nh6QmDoo0iIF3bE6T/MDymMrDZj+KYwnuSsUrZT8XT6PXG46MuiJWthMl4faGs7cFHIwHNT7+klwdGVanatwAWUf4qlcX1fPoXAYn1rn5ff3pjrW8neDFZqErvz5dFMFEwIk9ZSqsKaKP3cSeQ0bAsOjy4QQZuZRqqRKuz587lnyNtCisjAleT6WyM8CQRoh7oxQtFQvtavwenT8wHTEk+ME/0oHv/+u07sfvfaA==", "is_continuation": true } }, @@ -211,7 +220,8 @@ "languages": [ "eng" ], - "page_number": 1 + "page_number": 1, + "orig_elements": "eJyVUstu2zAQ/JWFzrZk+VG7vbW59JRD45zSQKDIlUWYIlVyaVsw8u9dynbhBkGB3sh9zczOvpwzNNihpUqr7AtkS1GvllKsN5vVUiyUWMyb+Vys1p9WtSo/L2U2gaxDEkqQ4Ppzlh5VcNFLHP89+k6HoJ0N1bXo5Zx1TqX0YlFuNm+vPCN6k+CK1nVY+Ggt+uLo/L6INpCPkqJH9feHMFB1H6m03XGsyPMCT6LrDU6Vk6FohVW1c/tp2eccOGVvDNhogzT0iUUm+t5oKYhJFgerctejPXWmcb4TFKauabRE7oxpMTnTUr13ElmV3XUmv2XSLoywuyiYRtKZod1lSV3PkcrGrkbP8TLhE54oYT86ankMaAv8CPD9ShZC66JRUCNIN4pEBeQATzIG5FqEQFybejkZCBEa7zquZuVDCh81tSCMARVJYwDd9S7wlHoY278Ju/exJznAA9sBPAx+RINhAsZJYcBfPinOktEHcM3YKdleygG2H5G48raOdHMBeraaGPaJBFsG22vdsUWLB/RjCe/zoNOR3CD+rIHFN2wOhYuclLtjx8T+hxvvV0iusCRqw/vi7rv9BKQUYpT3ZkygjgQWcdQFfNPBWe4c0jOdCS9pSGMGiJZpMRvm4o+andJMgW8LFZP6ymBMiiI5P4DHBj1amax57/6YS35/4NQEyhKe86f8IYefcTYTayhnJTD7gL/yyb8opBO9Xf2j8J5v/oDbdIpvr78BBqlrWQ==" } }, { @@ -232,6 +242,7 @@ "eng" ], "page_number": 1, + "orig_elements": 
"eJyVUstu2zAQ/JWFzrZk+VG7vbW59JRD45zSQKDIlUWYIlVyaVsw8u9dynbhBkGB3sh9zczOvpwzNNihpUqr7AtkS1GvllKsN5vVUiyUWMyb+Vys1p9WtSo/L2U2gaxDEkqQ4Ppzlh5VcNFLHP89+k6HoJ0N1bXo5Zx1TqX0YlFuNm+vPCN6k+CK1nVY+Ggt+uLo/L6INpCPkqJH9feHMFB1H6m03XGsyPMCT6LrDU6Vk6FohVW1c/tp2eccOGVvDNhogzT0iUUm+t5oKYhJFgerctejPXWmcb4TFKauabRE7oxpMTnTUr13ElmV3XUmv2XSLoywuyiYRtKZod1lSV3PkcrGrkbP8TLhE54oYT86ankMaAv8CPD9ShZC66JRUCNIN4pEBeQATzIG5FqEQFybejkZCBEa7zquZuVDCh81tSCMARVJYwDd9S7wlHoY278Ju/exJznAA9sBPAx+RINhAsZJYcBfPinOktEHcM3YKdleygG2H5G48raOdHMBeraaGPaJBFsG22vdsUWLB/RjCe/zoNOR3CD+rIHFN2wOhYuclLtjx8T+hxvvV0iusCRqw/vi7rv9BKQUYpT3ZkygjgQWcdQFfNPBWe4c0jOdCS9pSGMGiJZpMRvm4o+andJMgW8LFZP6ymBMiiI5P4DHBj1amax57/6YS35/4NQEyhKe86f8IYefcTYTayhnJTD7gL/yyb8opBO9Xf2j8J5v/oDbdIpvr78BBqlrWQ==", "is_continuation": true } }, @@ -253,6 +264,7 @@ "eng" ], "page_number": 1, + "orig_elements": "eJyVUstu2zAQ/JWFzrZk+VG7vbW59JRD45zSQKDIlUWYIlVyaVsw8u9dynbhBkGB3sh9zczOvpwzNNihpUqr7AtkS1GvllKsN5vVUiyUWMyb+Vys1p9WtSo/L2U2gaxDEkqQ4Ppzlh5VcNFLHP89+k6HoJ0N1bXo5Zx1TqX0YlFuNm+vPCN6k+CK1nVY+Ggt+uLo/L6INpCPkqJH9feHMFB1H6m03XGsyPMCT6LrDU6Vk6FohVW1c/tp2eccOGVvDNhogzT0iUUm+t5oKYhJFgerctejPXWmcb4TFKauabRE7oxpMTnTUr13ElmV3XUmv2XSLoywuyiYRtKZod1lSV3PkcrGrkbP8TLhE54oYT86ankMaAv8CPD9ShZC66JRUCNIN4pEBeQATzIG5FqEQFybejkZCBEa7zquZuVDCh81tSCMARVJYwDd9S7wlHoY278Ju/exJznAA9sBPAx+RINhAsZJYcBfPinOktEHcM3YKdleygG2H5G48raOdHMBeraaGPaJBFsG22vdsUWLB/RjCe/zoNOR3CD+rIHFN2wOhYuclLtjx8T+hxvvV0iusCRqw/vi7rv9BKQUYpT3ZkygjgQWcdQFfNPBWe4c0jOdCS9pSGMGiJZpMRvm4o+andJMgW8LFZP6ymBMiiI5P4DHBj1amax57/6YS35/4NQEyhKe86f8IYefcTYTayhnJTD7gL/yyb8opBO9Xf2j8J5v/oDbdIpvr78BBqlrWQ==", "is_continuation": true } }, @@ -274,6 +286,7 @@ "eng" ], "page_number": 1, + "orig_elements": 
"eJyVUstu2zAQ/JWFzrZk+VG7vbW59JRD45zSQKDIlUWYIlVyaVsw8u9dynbhBkGB3sh9zczOvpwzNNihpUqr7AtkS1GvllKsN5vVUiyUWMyb+Vys1p9WtSo/L2U2gaxDEkqQ4Ppzlh5VcNFLHP89+k6HoJ0N1bXo5Zx1TqX0YlFuNm+vPCN6k+CK1nVY+Ggt+uLo/L6INpCPkqJH9feHMFB1H6m03XGsyPMCT6LrDU6Vk6FohVW1c/tp2eccOGVvDNhogzT0iUUm+t5oKYhJFgerctejPXWmcb4TFKauabRE7oxpMTnTUr13ElmV3XUmv2XSLoywuyiYRtKZod1lSV3PkcrGrkbP8TLhE54oYT86ankMaAv8CPD9ShZC66JRUCNIN4pEBeQATzIG5FqEQFybejkZCBEa7zquZuVDCh81tSCMARVJYwDd9S7wlHoY278Ju/exJznAA9sBPAx+RINhAsZJYcBfPinOktEHcM3YKdleygG2H5G48raOdHMBeraaGPaJBFsG22vdsUWLB/RjCe/zoNOR3CD+rIHFN2wOhYuclLtjx8T+hxvvV0iusCRqw/vi7rv9BKQUYpT3ZkygjgQWcdQFfNPBWe4c0jOdCS9pSGMGiJZpMRvm4o+andJMgW8LFZP6ymBMiiI5P4DHBj1amax57/6YS35/4NQEyhKe86f8IYefcTYTayhnJTD7gL/yyb8opBO9Xf2j8J5v/oDbdIpvr78BBqlrWQ==", "is_continuation": true } }, @@ -295,6 +308,7 @@ "eng" ], "page_number": 1, + "orig_elements": "eJyVUstu2zAQ/JWFzrZk+VG7vbW59JRD45zSQKDIlUWYIlVyaVsw8u9dynbhBkGB3sh9zczOvpwzNNihpUqr7AtkS1GvllKsN5vVUiyUWMyb+Vys1p9WtSo/L2U2gaxDEkqQ4Ppzlh5VcNFLHP89+k6HoJ0N1bXo5Zx1TqX0YlFuNm+vPCN6k+CK1nVY+Ggt+uLo/L6INpCPkqJH9feHMFB1H6m03XGsyPMCT6LrDU6Vk6FohVW1c/tp2eccOGVvDNhogzT0iUUm+t5oKYhJFgerctejPXWmcb4TFKauabRE7oxpMTnTUr13ElmV3XUmv2XSLoywuyiYRtKZod1lSV3PkcrGrkbP8TLhE54oYT86ankMaAv8CPD9ShZC66JRUCNIN4pEBeQATzIG5FqEQFybejkZCBEa7zquZuVDCh81tSCMARVJYwDd9S7wlHoY278Ju/exJznAA9sBPAx+RINhAsZJYcBfPinOktEHcM3YKdleygG2H5G48raOdHMBeraaGPaJBFsG22vdsUWLB/RjCe/zoNOR3CD+rIHFN2wOhYuclLtjx8T+hxvvV0iusCRqw/vi7rv9BKQUYpT3ZkygjgQWcdQFfNPBWe4c0jOdCS9pSGMGiJZpMRvm4o+andJMgW8LFZP6ymBMiiI5P4DHBj1amax57/6YS35/4NQEyhKe86f8IYefcTYTayhnJTD7gL/yyb8opBO9Xf2j8J5v/oDbdIpvr78BBqlrWQ==", "is_continuation": true } }, @@ -315,7 +329,8 @@ "languages": [ "eng" ], - "page_number": 1 + "page_number": 1, + "orig_elements": 
"eJztkrGO2zAMhl9F0NzYcewkdsdOnYoDLp0OB4OWKFuIJRkSnaYI8u6lihyQdukLdJN+kuLHX3y7SZzRoafeavlZyMOxaevd8QD7bVVp02FT74ftvgF9NBoPKD8J6ZBAAwHn32Q+9CmsUeHv+4LR2ZRs8Kl/JL3dpAs6h+u6atv7O7+xxjm3K6fgsIyr9xjLHyGey9UniquiNaL+80KYqH9WeutH1sqiKPEKbplxo4NK5QReDyGcN9VSsHCVd25o7Iz0c8kUEpZltgqIIcuL10VY0F/dbEJ0QGkTjLEKuXLNxhSMpZcYFPJUfnRz8RHJXszgxxUYI88p0Y8yT7ew0vvVDRhZr3J/wivl3qfJJvH1QSh0wCR8IKEiAqEArW3GgllEO06UBIxgeWZBE4pEXMcMgi1IhChCFN+9JdTilbg8idMjYL0wcOFwMCJwaRQLRLKYigz94cM3iJFduOApwzHl39uA9bBVTbPrWqMqVTUdmME0qu26gxm6XfN/G/65Dc9//+z9C6d94U8/y/v7L0G/O2U=" } }, { @@ -335,7 +350,8 @@ "languages": [ "eng" ], - "page_number": 2 + "page_number": 2, + "orig_elements": "eJxVkMtOwzAQRX8l8prGLQltyg5EEEiISm26KlXk2JPUql/yA4Kq/js2aiXYec6M5947uxMCARKUbzlD9xkql/M5XVTTsiOkK/rF3awoq64ol8ViCrQg6CZDEjxhxJM4f0Lp0TodLIXf2oCV3DmulWsvQ7sTkpqldlHMquq8jzuCFUkOH7QEbINSYPGXtkcclPM2UB8ssP+FB+fbv6TlaogM5zmGkUgjYMI0dfhAFOu0Pk5mJo9gROco2HMB/tskF4gYIzglPprEn4rl2oAapei1lcS7ie57TiH+DOkwebTFjNUUYio1SJFfO+kWgqghkGgj5USgBpTSmUhaFWQHNvLbpO9h9En7Mf/w69Vbna2es+alzrbvr039lG2ah6beZM16u2nqOm2+mm24F4DO+x9Co5CC" } }, { @@ -355,7 +371,8 @@ "languages": [ "eng" ], - "page_number": 2 + "page_number": 2, + "orig_elements": "eJxdUslu20AM/RVC50Ty1tTprUXPPRS+BYExnqEsNpoFHI4XGP73cmSnTXsRRlzee3zky6XBET0G2ZJrvkBjjVms5k/r1cIu1uvFpzn2s9XMzezs+QmX/bJ5gMajGGfEaP2lqY9tjoUtTv8J2VPOFEPe3oteLo2PrqaXy/l6fX1VjMJjpeuG6LHjEgJyd4z81pWQhYuVwuj+/RHMsv0Y2VLYa6xr2w5PxqcRH120uRtMcLsY3x7nqdXAqbkqYU8jyjlVFY1JaSRrREV2h+DamDCc/NhH9kbyY+x7sqidpRrTqiyXOFrUqcLej+17pnoxmrAvRmXUORsM+6ZOlzSyDcXvkDW+qPyCJ6ncmwHhmwlvXJLYM/zEygpfrUDsYf78eQ2MPh7QgWjl7m/lr+L2CD1HP2UYc1KPaUcjCWEGxQFnzhL1A8Z5CqRWTUNWaGsy5hbgO+4kcn4Aqw7S7al+KSaxg2R4AjuSDApyQM4IFASVTjJInLh1A1kQ4ahhsDFY5DAJNjLlrZ6DPMBxIDtAYjxQLHlUVSnFCuYmxlz0Vg6U76PeQbUtltFBiALkJz1mrL1ODahLw1pNDEz7QSWZfNdpuArXSW8ybmgtbKLmncrPtS//0atjT4CKPqn56NhBfS7BVutuVlD4fxv5rPD+5sCOej1/VebaehLvV/bD8A1sU1d/ff0N/Mc8Kg==" } }, { @@ -376,6 +393,7 @@ "eng" ], "page_number": 2, + "orig_elements": 
"eJxdUslu20AM/RVC50Ty1tTprUXPPRS+BYExnqEsNpoFHI4XGP73cmSnTXsRRlzee3zky6XBET0G2ZJrvkBjjVms5k/r1cIu1uvFpzn2s9XMzezs+QmX/bJ5gMajGGfEaP2lqY9tjoUtTv8J2VPOFEPe3oteLo2PrqaXy/l6fX1VjMJjpeuG6LHjEgJyd4z81pWQhYuVwuj+/RHMsv0Y2VLYa6xr2w5PxqcRH120uRtMcLsY3x7nqdXAqbkqYU8jyjlVFY1JaSRrREV2h+DamDCc/NhH9kbyY+x7sqidpRrTqiyXOFrUqcLej+17pnoxmrAvRmXUORsM+6ZOlzSyDcXvkDW+qPyCJ6ncmwHhmwlvXJLYM/zEygpfrUDsYf78eQ2MPh7QgWjl7m/lr+L2CD1HP2UYc1KPaUcjCWEGxQFnzhL1A8Z5CqRWTUNWaGsy5hbgO+4kcn4Aqw7S7al+KSaxg2R4AjuSDApyQM4IFASVTjJInLh1A1kQ4ahhsDFY5DAJNjLlrZ6DPMBxIDtAYjxQLHlUVSnFCuYmxlz0Vg6U76PeQbUtltFBiALkJz1mrL1ODahLw1pNDEz7QSWZfNdpuArXSW8ybmgtbKLmncrPtS//0atjT4CKPqn56NhBfS7BVutuVlD4fxv5rPD+5sCOej1/VebaehLvV/bD8A1sU1d/ff0N/Mc8Kg==", "is_continuation": true } }, @@ -397,6 +415,7 @@ "eng" ], "page_number": 2, + "orig_elements": "eJxdUslu20AM/RVC50Ty1tTprUXPPRS+BYExnqEsNpoFHI4XGP73cmSnTXsRRlzee3zky6XBET0G2ZJrvkBjjVms5k/r1cIu1uvFpzn2s9XMzezs+QmX/bJ5gMajGGfEaP2lqY9tjoUtTv8J2VPOFEPe3oteLo2PrqaXy/l6fX1VjMJjpeuG6LHjEgJyd4z81pWQhYuVwuj+/RHMsv0Y2VLYa6xr2w5PxqcRH120uRtMcLsY3x7nqdXAqbkqYU8jyjlVFY1JaSRrREV2h+DamDCc/NhH9kbyY+x7sqidpRrTqiyXOFrUqcLej+17pnoxmrAvRmXUORsM+6ZOlzSyDcXvkDW+qPyCJ6ncmwHhmwlvXJLYM/zEygpfrUDsYf78eQ2MPh7QgWjl7m/lr+L2CD1HP2UYc1KPaUcjCWEGxQFnzhL1A8Z5CqRWTUNWaGsy5hbgO+4kcn4Aqw7S7al+KSaxg2R4AjuSDApyQM4IFASVTjJInLh1A1kQ4ahhsDFY5DAJNjLlrZ6DPMBxIDtAYjxQLHlUVSnFCuYmxlz0Vg6U76PeQbUtltFBiALkJz1mrL1ODahLw1pNDEz7QSWZfNdpuArXSW8ybmgtbKLmncrPtS//0atjT4CKPqn56NhBfS7BVutuVlD4fxv5rPD+5sCOej1/VebaehLvV/bD8A1sU1d/ff0N/Mc8Kg==", "is_continuation": true } }, @@ -418,6 +437,7 @@ "eng" ], "page_number": 2, + "orig_elements": 
"eJxdUslu20AM/RVC50Ty1tTprUXPPRS+BYExnqEsNpoFHI4XGP73cmSnTXsRRlzee3zky6XBET0G2ZJrvkBjjVms5k/r1cIu1uvFpzn2s9XMzezs+QmX/bJ5gMajGGfEaP2lqY9tjoUtTv8J2VPOFEPe3oteLo2PrqaXy/l6fX1VjMJjpeuG6LHjEgJyd4z81pWQhYuVwuj+/RHMsv0Y2VLYa6xr2w5PxqcRH120uRtMcLsY3x7nqdXAqbkqYU8jyjlVFY1JaSRrREV2h+DamDCc/NhH9kbyY+x7sqidpRrTqiyXOFrUqcLej+17pnoxmrAvRmXUORsM+6ZOlzSyDcXvkDW+qPyCJ6ncmwHhmwlvXJLYM/zEygpfrUDsYf78eQ2MPh7QgWjl7m/lr+L2CD1HP2UYc1KPaUcjCWEGxQFnzhL1A8Z5CqRWTUNWaGsy5hbgO+4kcn4Aqw7S7al+KSaxg2R4AjuSDApyQM4IFASVTjJInLh1A1kQ4ahhsDFY5DAJNjLlrZ6DPMBxIDtAYjxQLHlUVSnFCuYmxlz0Vg6U76PeQbUtltFBiALkJz1mrL1ODahLw1pNDEz7QSWZfNdpuArXSW8ybmgtbKLmncrPtS//0atjT4CKPqn56NhBfS7BVutuVlD4fxv5rPD+5sCOej1/VebaehLvV/bD8A1sU1d/ff0N/Mc8Kg==", "is_continuation": true } }, @@ -438,7 +458,8 @@ "languages": [ "eng" ], - "page_number": 2 + "page_number": 2, + "orig_elements": "eJx1kk1r3DAQhv/K4HPXztpJnO2t0EMptBS6PaXBaK2xLWKNhDTaD5b97x15N9AUetN8SO/zzuj5XOCMFok7o4uPUNRN26iHVjebx1bdr1HV+vFu91C395u2bfq6+ACFRVZasZL+c5EPXXQp9LjEHoM1MRpHsbs1PZ8L63QuN8366enyIm+kMGe5anIWq5CIMFQHF16rRJFD6jkF1O8Dxsjd35nO0Ci5qiwrPCrrZ1xp18dqUqR3zr2u1r6UxLG4iOBgZuSTzxSF8n42vWKBrPakS+eRjnYeXLCK48oNg+lRbqY8mFKwtA+uR3FFo53Lt0qexaxoTEowss8CaSyyOy+ZjpLdYZB8nfUZj5y1vyk6gdLWkBEnwrBHGBL1GSZCRsAwn0DGuJw17E7AE0IvI2Y4YEDwsxI8OBieDC3Fz+hV4MwEboCvKbIYkEpwaZyu1wMufnM9xz+CG4OyJcAn62hccv+lUuJ8JJFktzT+IsMS/WQlO4FtED3EK9vyjvfO0BVHVgExiZu9iTf5flKeMcC6Ab5ejWUFsJ1MhC+31YGc5RclUUmkpXl5N/HkguHTPyZ+p/puvYmApHazrAiigCVBKyGv6G3r31W4OtvmVVxe/gCNZhA/" } }, { @@ -459,6 +480,7 @@ "eng" ], "page_number": 2, + "orig_elements": 
"eJx1kk1r3DAQhv/K4HPXztpJnO2t0EMptBS6PaXBaK2xLWKNhDTaD5b97x15N9AUetN8SO/zzuj5XOCMFok7o4uPUNRN26iHVjebx1bdr1HV+vFu91C395u2bfq6+ACFRVZasZL+c5EPXXQp9LjEHoM1MRpHsbs1PZ8L63QuN8366enyIm+kMGe5anIWq5CIMFQHF16rRJFD6jkF1O8Dxsjd35nO0Ci5qiwrPCrrZ1xp18dqUqR3zr2u1r6UxLG4iOBgZuSTzxSF8n42vWKBrPakS+eRjnYeXLCK48oNg+lRbqY8mFKwtA+uR3FFo53Lt0qexaxoTEowss8CaSyyOy+ZjpLdYZB8nfUZj5y1vyk6gdLWkBEnwrBHGBL1GSZCRsAwn0DGuJw17E7AE0IvI2Y4YEDwsxI8OBieDC3Fz+hV4MwEboCvKbIYkEpwaZyu1wMufnM9xz+CG4OyJcAn62hccv+lUuJ8JJFktzT+IsMS/WQlO4FtED3EK9vyjvfO0BVHVgExiZu9iTf5flKeMcC6Ab5ejWUFsJ1MhC+31YGc5RclUUmkpXl5N/HkguHTPyZ+p/puvYmApHazrAiigCVBKyGv6G3r31W4OtvmVVxe/gCNZhA/", "is_continuation": true } }, @@ -480,6 +502,7 @@ "eng" ], "page_number": 2, + "orig_elements": "eJx1kk1r3DAQhv/K4HPXztpJnO2t0EMptBS6PaXBaK2xLWKNhDTaD5b97x15N9AUetN8SO/zzuj5XOCMFok7o4uPUNRN26iHVjebx1bdr1HV+vFu91C395u2bfq6+ACFRVZasZL+c5EPXXQp9LjEHoM1MRpHsbs1PZ8L63QuN8366enyIm+kMGe5anIWq5CIMFQHF16rRJFD6jkF1O8Dxsjd35nO0Ci5qiwrPCrrZ1xp18dqUqR3zr2u1r6UxLG4iOBgZuSTzxSF8n42vWKBrPakS+eRjnYeXLCK48oNg+lRbqY8mFKwtA+uR3FFo53Lt0qexaxoTEowss8CaSyyOy+ZjpLdYZB8nfUZj5y1vyk6gdLWkBEnwrBHGBL1GSZCRsAwn0DGuJw17E7AE0IvI2Y4YEDwsxI8OBieDC3Fz+hV4MwEboCvKbIYkEpwaZyu1wMufnM9xz+CG4OyJcAn62hccv+lUuJ8JJFktzT+IsMS/WQlO4FtED3EK9vyjvfO0BVHVgExiZu9iTf5flKeMcC6Ab5ejWUFsJ1MhC+31YGc5RclUUmkpXl5N/HkguHTPyZ+p/puvYmApHazrAiigCVBKyGv6G3r31W4OtvmVVxe/gCNZhA/", "is_continuation": true } }, @@ -500,7 +523,8 @@ "languages": [ "eng" ], - "page_number": 2 + "page_number": 2, + "orig_elements": "eJxVkMtuwjAQRX8l8rrETYN4dIcKrdiABGZRAYqceBIs/JIfbSrEv9dGILU7z5kZ33tnf0EgQILyFWfoNUM1HY+GxYiNR9NhC209gaIZQ8nKZ0qnJRToKUMSPGXU0zh/QelROR1sA7fagJXcOa6Vq+5D+wuSmqV2WRaTyfUY/whWJDl80hKwDUqBxd/annFQztvQ+GCB/S88OF/9JRVXXWQ4zzH0VBoBA6Ybh09UsVrr86AweQQ9ukbBlgvwPya5QNQYwRvqo0n8pViuDaheilZbSb0b6LblDcTNkA6TR1vMWN1ATKU6KfJHJ91CUNUFGm2knAhUh1I6E0mlgqzBRv6S9D30Pmm/5Qe/JTOyI+vNZzbfkeVim63fs1kW6Wq+XH1kZLPbksXicBN4eCbcC0DX4y+F4ZOu" } }, { @@ -520,7 +544,8 @@ "languages": [ "eng" ], - "page_number": 2 + "page_number": 2, 
+ "orig_elements": "eJx1Ustu2zAQ/JWFTi1gU5YdO1KvAQr00kPjnNJAoMilTZgPgQ/XhuF/71KtgaRwT6J2d2Znhny9VGjQoku9ltUXqETz0C02j12jOtysOsW7ZrNar1u5bAcpGlXNoLKYuOSJ0/ylKoc++hwETv8jBqtj1N7F/u/Q66WyXpb2atW07fWNOHIwZV299xbrkJ3DUP/y4VBnF1PIIuWA8uNPwpj695Veux3VasZqPHE7GpxLL2K9504O3h/mzciocKqutFBpg+k8FhUVH0ejBU8ksj46yfyI7mSN8sHyFOdeKS2QkLkEw0iWHIMXSK7czhp265QsDHe7zElG8Vmh21XF3UiV3mU7YKD6suxPeEpl93aPEBMpJC4gKzEhwp5H4KC0zELzcIaAcaQA9aCNTmdIHhLBBu4OIY9JnIFs84QM4C6djmB9QAJxR7wW6Sx1HHIoDoDEka3/YC19YECge4zecWPOoN3RmyNKOkw6bqMUW5hCJK5vamrdk+ID8Qm65wjZ8cFg8UPQEvcEkjlpahLyo/FS9IoM/Ms6u79rkq6tRakpHBLO5VFHnIZfnE7k4LnEFmH7B0G6ly28sGf2xOBnXiz4I6zbzafh86w0nthX9uNdgz1QhyQqytOJsvne0IqGWHkat9f2nYcS0xG35Qlc334DM0Qy5w==" } }, { @@ -541,6 +566,7 @@ "eng" ], "page_number": 2, + "orig_elements": "eJx1Ustu2zAQ/JWFTi1gU5YdO1KvAQr00kPjnNJAoMilTZgPgQ/XhuF/71KtgaRwT6J2d2Znhny9VGjQoku9ltUXqETz0C02j12jOtysOsW7ZrNar1u5bAcpGlXNoLKYuOSJ0/ylKoc++hwETv8jBqtj1N7F/u/Q66WyXpb2atW07fWNOHIwZV299xbrkJ3DUP/y4VBnF1PIIuWA8uNPwpj695Veux3VasZqPHE7GpxLL2K9504O3h/mzciocKqutFBpg+k8FhUVH0ejBU8ksj46yfyI7mSN8sHyFOdeKS2QkLkEw0iWHIMXSK7czhp265QsDHe7zElG8Vmh21XF3UiV3mU7YKD6suxPeEpl93aPEBMpJC4gKzEhwp5H4KC0zELzcIaAcaQA9aCNTmdIHhLBBu4OIY9JnIFs84QM4C6djmB9QAJxR7wW6Sx1HHIoDoDEka3/YC19YECge4zecWPOoN3RmyNKOkw6bqMUW5hCJK5vamrdk+ID8Qm65wjZ8cFg8UPQEvcEkjlpahLyo/FS9IoM/Ms6u79rkq6tRakpHBLO5VFHnIZfnE7k4LnEFmH7B0G6ly28sGf2xOBnXiz4I6zbzafh86w0nthX9uNdgz1QhyQqytOJsvne0IqGWHkat9f2nYcS0xG35Qlc334DM0Qy5w==", "is_continuation": true } }, @@ -562,6 +588,7 @@ "eng" ], "page_number": 2, + "orig_elements": 
"eJx1Ustu2zAQ/JWFTi1gU5YdO1KvAQr00kPjnNJAoMilTZgPgQ/XhuF/71KtgaRwT6J2d2Znhny9VGjQoku9ltUXqETz0C02j12jOtysOsW7ZrNar1u5bAcpGlXNoLKYuOSJ0/ylKoc++hwETv8jBqtj1N7F/u/Q66WyXpb2atW07fWNOHIwZV299xbrkJ3DUP/y4VBnF1PIIuWA8uNPwpj695Veux3VasZqPHE7GpxLL2K9504O3h/mzciocKqutFBpg+k8FhUVH0ejBU8ksj46yfyI7mSN8sHyFOdeKS2QkLkEw0iWHIMXSK7czhp265QsDHe7zElG8Vmh21XF3UiV3mU7YKD6suxPeEpl93aPEBMpJC4gKzEhwp5H4KC0zELzcIaAcaQA9aCNTmdIHhLBBu4OIY9JnIFs84QM4C6djmB9QAJxR7wW6Sx1HHIoDoDEka3/YC19YECge4zecWPOoN3RmyNKOkw6bqMUW5hCJK5vamrdk+ID8Qm65wjZ8cFg8UPQEvcEkjlpahLyo/FS9IoM/Ms6u79rkq6tRakpHBLO5VFHnIZfnE7k4LnEFmH7B0G6ly28sGf2xOBnXiz4I6zbzafh86w0nthX9uNdgz1QhyQqytOJsvne0IqGWHkat9f2nYcS0xG35Qlc334DM0Qy5w==", "is_continuation": true } }, @@ -583,6 +610,7 @@ "eng" ], "page_number": 2, + "orig_elements": "eJx1Ustu2zAQ/JWFTi1gU5YdO1KvAQr00kPjnNJAoMilTZgPgQ/XhuF/71KtgaRwT6J2d2Znhny9VGjQoku9ltUXqETz0C02j12jOtysOsW7ZrNar1u5bAcpGlXNoLKYuOSJ0/ylKoc++hwETv8jBqtj1N7F/u/Q66WyXpb2atW07fWNOHIwZV299xbrkJ3DUP/y4VBnF1PIIuWA8uNPwpj695Veux3VasZqPHE7GpxLL2K9504O3h/mzciocKqutFBpg+k8FhUVH0ejBU8ksj46yfyI7mSN8sHyFOdeKS2QkLkEw0iWHIMXSK7czhp265QsDHe7zElG8Vmh21XF3UiV3mU7YKD6suxPeEpl93aPEBMpJC4gKzEhwp5H4KC0zELzcIaAcaQA9aCNTmdIHhLBBu4OIY9JnIFs84QM4C6djmB9QAJxR7wW6Sx1HHIoDoDEka3/YC19YECge4zecWPOoN3RmyNKOkw6bqMUW5hCJK5vamrdk+ID8Qm65wjZ8cFg8UPQEvcEkjlpahLyo/FS9IoM/Ms6u79rkq6tRakpHBLO5VFHnIZfnE7k4LnEFmH7B0G6ly28sGf2xOBnXiz4I6zbzafh86w0nthX9uNdgz1QhyQqytOJsvne0IqGWHkat9f2nYcS0xG35Qlc334DM0Qy5w==", "is_continuation": true } }, @@ -603,7 +631,8 @@ "languages": [ "eng" ], - "page_number": 2 + "page_number": 2, + "orig_elements": 
"eJxtUbluGzEQ/ZXB1tLelmx3QZpUaeJUtrGgyFktYV4gh5YMQf+eoSIlDpBu7nfM86lCgxYdTVpVj1ANoxjGTddKOY7d3fwwD+pu3LZdL0Xfb3BTraCySEIJEjx/qkowJZ+jxEseMFqdkvYuTdeh51NlvSrtYeju78+vfCNHU+CaxVtsYnYOY3Pw8a3JLlHMknJE9W9CmGj6XJm023OtqesGj8IGg2vlZWoW4dTO+7d1F2ouHKszA87aIH2EwqISIRgtBTHJ5t2p2gd0R2tmH62gtPbzrCXyZi7G1ExLheglsiq3t6a+dYoXRrh9Fkyj6KzQ7auiLnBlctnuMHK9L/iERyrYXwwtPu8XoEUn+HalChw7T6AdoVOogDzsEARIX3QRQiJBmXz8gIgzRnQSV3zj0nCKiQH7kgjxJfdt95AgRG0Fj/9dVJk0JhCRl5CA5dLCiNB18LP+UX+t4SW3rdhCN7T9Cg6LlqUvfQw+CrYfdp/QIfHrwM8XEtfTnMlFBMII2xufxEDZqf8CbduxBnhaMP05wXgmK9a2y3ShWmwx2mq6uPJb8+yN8QcW/Vh+cHvrdxGZpn7Hp+L1+fUXTtD9sA==" } }, { @@ -624,6 +653,7 @@ "eng" ], "page_number": 2, + "orig_elements": "eJxtUbluGzEQ/ZXB1tLelmx3QZpUaeJUtrGgyFktYV4gh5YMQf+eoSIlDpBu7nfM86lCgxYdTVpVj1ANoxjGTddKOY7d3fwwD+pu3LZdL0Xfb3BTraCySEIJEjx/qkowJZ+jxEseMFqdkvYuTdeh51NlvSrtYeju78+vfCNHU+CaxVtsYnYOY3Pw8a3JLlHMknJE9W9CmGj6XJm023OtqesGj8IGg2vlZWoW4dTO+7d1F2ouHKszA87aIH2EwqISIRgtBTHJ5t2p2gd0R2tmH62gtPbzrCXyZi7G1ExLheglsiq3t6a+dYoXRrh9Fkyj6KzQ7auiLnBlctnuMHK9L/iERyrYXwwtPu8XoEUn+HalChw7T6AdoVOogDzsEARIX3QRQiJBmXz8gIgzRnQSV3zj0nCKiQH7kgjxJfdt95AgRG0Fj/9dVJk0JhCRl5CA5dLCiNB18LP+UX+t4SW3rdhCN7T9Cg6LlqUvfQw+CrYfdp/QIfHrwM8XEtfTnMlFBMII2xufxEDZqf8CbduxBnhaMP05wXgmK9a2y3ShWmwx2mq6uPJb8+yN8QcW/Vh+cHvrdxGZpn7Hp+L1+fUXTtD9sA==", "is_continuation": true } }, @@ -645,6 +675,7 @@ "eng" ], "page_number": 2, + "orig_elements": "eJxtUbluGzEQ/ZXB1tLelmx3QZpUaeJUtrGgyFktYV4gh5YMQf+eoSIlDpBu7nfM86lCgxYdTVpVj1ANoxjGTddKOY7d3fwwD+pu3LZdL0Xfb3BTraCySEIJEjx/qkowJZ+jxEseMFqdkvYuTdeh51NlvSrtYeju78+vfCNHU+CaxVtsYnYOY3Pw8a3JLlHMknJE9W9CmGj6XJm023OtqesGj8IGg2vlZWoW4dTO+7d1F2ouHKszA87aIH2EwqISIRgtBTHJ5t2p2gd0R2tmH62gtPbzrCXyZi7G1ExLheglsiq3t6a+dYoXRrh9Fkyj6KzQ7auiLnBlctnuMHK9L/iERyrYXwwtPu8XoEUn+HalChw7T6AdoVOogDzsEARIX3QRQiJBmXz8gIgzRnQSV3zj0nCKiQH7kgjxJfdt95AgRG0Fj/9dVJk0JhCRl5CA5dLCiNB18LP+UX+t4SW3rdhCN7T9Cg6LlqUvfQw+CrYfdp/QIfHrwM8XEtfTnMlFBMII2xufxEDZqf8CbduxBnhaMP05wXgmK9a2y3ShWmwx2mq6uPJb8+yN8QcW/Vh+cHvrdxGZpn7Hp+L1+fUXTtD9sA==", 
"is_continuation": true } }, @@ -664,7 +695,8 @@ "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": [ "eng" - ] + ], + "orig_elements": "eJxVUNFuwyAM/JWK5zUkS9UmfZ20n6iqiIKToAJGBrZEUf99UK3S9gQ+n+/OvmwMDFhwcdCKnXesU6M8ncTtUHc91L06tP0Jmve2PtbdUTY9e9sxC1EoEUXmb6x8hoCJJDxrD2R1CBpdGH5Jl41ZVKXdtk3XPa5ZI5EpdnxGC5ySc0D8G+nOkwuRkoyJQP0vIoQ4/EUG7aaM8arisAjrDewVysBn4dQN8b5vfJWBhT2y4agNxNWXFEx4b7QUMYfkX05V6MEt1oxIVsSwx3HUEvJkKoepcizlCSXkrdxkTfXqlFvMIBTQMCLG/LwMPGkraC0EI9yURM5ZDsHATexa4kRYYmF+oF9JT/NT7DX++VRjj+sPD/6QPg==" } } ] \ No newline at end of file diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index 1a9406be93..002e7f91c0 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -321,6 +321,8 @@ def from_dict(cls, meta_dict: dict[str, Any]) -> ElementMetadata: This would generally be a dict formed using the `.to_dict()` method and stored as JSON before "rehydrating" it using this method. """ + from unstructured.staging.base import elements_from_base64_gzipped_json + # -- avoid unexpected mutation by working on a copy of provided dict -- meta_dict = copy.deepcopy(meta_dict) self = ElementMetadata() @@ -329,6 +331,8 @@ def from_dict(cls, meta_dict: dict[str, Any]) -> ElementMetadata: self.coordinates = CoordinatesMetadata.from_dict(field_value) elif field_name == "data_source": self.data_source = DataSourceMetadata.from_dict(field_value) + elif field_name == "orig_elements": + self.orig_elements = elements_from_base64_gzipped_json(field_value) else: setattr(self, field_name, field_value) @@ -372,15 +376,14 @@ def to_dict(self) -> dict[str, Any]: The returned dict is "sparse" in that no key-value pair appears for a field with value `None`. 
""" + from unstructured.staging.base import elements_to_base64_gzipped_json + meta_dict = copy.deepcopy(dict(self.fields)) # -- remove fields that should not be serialized -- for field_name in self.DEBUG_FIELD_NAMES: meta_dict.pop(field_name, None) - # -- remove `.orig_elements` for now as that won't serialize -- - meta_dict.pop("orig_elements", None) - # -- don't serialize empty lists -- meta_dict: dict[str, Any] = { field_name: value @@ -393,6 +396,8 @@ def to_dict(self) -> dict[str, Any]: meta_dict["coordinates"] = self.coordinates.to_dict() if self.data_source is not None: meta_dict["data_source"] = self.data_source.to_dict() + if self.orig_elements is not None: + meta_dict["orig_elements"] = elements_to_base64_gzipped_json(self.orig_elements) return meta_dict diff --git a/unstructured/staging/base.py b/unstructured/staging/base.py index e7c3151e08..f79fa77567 100644 --- a/unstructured/staging/base.py +++ b/unstructured/staging/base.py @@ -1,8 +1,10 @@ from __future__ import annotations +import base64 import csv import io import json +import zlib from copy import deepcopy from datetime import datetime from typing import Any, Iterable, Optional, Sequence, cast @@ -21,7 +23,6 @@ if dependency_exists("pandas"): import pandas as pd - # ================================================================================================ # SERIALIZATION/DESERIALIZATION (SERDE) RELATED FUNCTIONS # ================================================================================================ @@ -32,6 +33,24 @@ # == DESERIALIZERS =============================== +def elements_from_base64_gzipped_json(b64_encoded_elements: str) -> list[Element]: + """Restore Base64-encoded gzipped JSON elements to element objects. + + This is used to when deserializing `ElementMetadata.orig_elements` from its compressed form in + JSON and dict forms and perhaps for other purposes. 
+ """ + # -- Base64 str -> gzip-encoded (JSON) bytes -- + decoded_b64_bytes = base64.b64decode(b64_encoded_elements) + # -- undo gzip compression -- + elements_json_bytes = zlib.decompress(decoded_b64_bytes) + # -- JSON (bytes) to JSON (str) -- + elements_json_str = elements_json_bytes.decode("utf-8") + # -- JSON (str) -> dicts -- + element_dicts = json.loads(elements_json_str) + # -- dicts -> elements -- + return elements_from_dicts(element_dicts) + + def elements_from_dicts(element_dicts: Iterable[dict[str, Any]]) -> list[Element]: """Convert a list of element-dicts to a list of elements.""" elements: list[Element] = [] @@ -78,6 +97,28 @@ def elements_from_json( # == SERIALIZERS ================================= +def elements_to_base64_gzipped_json(elements: Iterable[Element]) -> str: + """Convert `elements` to Base64-encoded gzipped JSON. + + This is used when serializing `ElementMetadata.orig_elements` to make it as compact as + possible when transported as JSON, for example in an HTTP response. This compressed form is also + present when elements are in dict form ("element_dicts"). This function is not coupled to that + purpose however and could have other uses. 
+ """ + # -- adjust floating-point precision of coordinates down for a more compact str value -- + precision_adjusted_elements = _fix_metadata_field_precision(elements) + # -- serialize elements as dicts -- + element_dicts = elements_to_dicts(precision_adjusted_elements) + # -- serialize the dicts to JSON (bytes) -- + json_bytes = json.dumps(element_dicts, sort_keys=True).encode("utf-8") + # -- compress the JSON bytes with gzip compression -- + deflated_bytes = zlib.compress(json_bytes) + # -- base64-encode those bytes so they can be serialized as a JSON string value -- + b64_deflated_bytes = base64.b64encode(deflated_bytes) + # -- convert to a string suitable for serializing in JSON -- + return b64_deflated_bytes.decode("utf-8") + + def elements_to_dicts(elements: Iterable[Element]) -> list[dict[str, Any]]: """Convert document elements to element-dicts.""" return [e.to_dict() for e in elements]