diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3cb70c60cf..032575f0c7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -113,6 +113,7 @@ jobs: runs-on: ubuntu-latest env: NLTK_DATA: ${{ github.workspace }}/nltk_data + UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }} needs: [setup, lint] steps: - uses: actions/checkout@v3 @@ -216,6 +217,7 @@ jobs: - name: Test env: UNS_API_KEY: ${{ secrets.UNS_API_KEY }} + UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }} run: | source .venv-${{ matrix.extra }}/bin/activate # NOTE(newelh) - determine what needs to be installed here @@ -425,5 +427,6 @@ jobs: run: | source .venv/bin/activate echo "UNS_API_KEY=${{ secrets.UNS_API_KEY }}" > uns_test_env_file + echo "UNSTRUCTURED_HF_TOKEN=${{ secrets.HF_TOKEN }}" >> uns_test_env_file make docker-build make docker-test CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true diff --git a/CHANGELOG.md b/CHANGELOG.md index ca337ff4cb..3eafc3a721 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ### Enhancements +* **Bump `unstructured-inference` to `0.7.3`** The updated version of `unstructured-inference` supports a new version of the Chipper model, as well as a cleaner schema for its output classes. Support is included for new inference features such as hierarchy and ordering. * **Expose skip_infer_table_types in ingest CLI.** For each connector a new `--skip-infer-table-types` parameter was added to map to the `skip_infer_table_types` partition argument. This gives more granular control to unstructured-ingest users, allowing them to specify the file types for which we should attempt table extraction. * **Add flag to ingest CLI to raise error if any single doc fails in pipeline** Currently if a single doc fails in the pipeline, the whole thing halts due to the error. This flag defaults to log an error but continue with the docs it can. 
diff --git a/requirements/dev.txt b/requirements/dev.txt index b4bdb78e2d..c94726debc 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -144,7 +144,7 @@ jsonschema-specifications==2023.7.1 # via jsonschema jupyter==1.0.0 # via -r requirements/dev.in -jupyter-client==8.3.1 +jupyter-client==8.4.0 # via # ipykernel # jupyter-console @@ -153,7 +153,7 @@ jupyter-client==8.3.1 # qtconsole jupyter-console==6.6.3 # via jupyter -jupyter-core==5.3.2 +jupyter-core==5.4.0 # via # -c requirements/constraints.in # ipykernel @@ -178,7 +178,7 @@ jupyter-server==2.7.3 # notebook-shim jupyter-server-terminals==0.4.4 # via jupyter-server -jupyterlab==4.0.6 +jupyterlab==4.0.7 # via notebook jupyterlab-pygments==0.2.2 # via nbconvert @@ -213,7 +213,7 @@ nest-asyncio==1.5.8 # via ipykernel nodeenv==1.8.0 # via pre-commit -notebook==7.0.4 +notebook==7.0.5 # via jupyter notebook-shim==0.2.3 # via @@ -320,7 +320,7 @@ rfc3986-validator==0.1.1 # via # jsonschema # jupyter-events -rpds-py==0.10.4 +rpds-py==0.10.6 # via # jsonschema # referencing diff --git a/requirements/extra-docx.txt b/requirements/extra-docx.txt index 7e83c2bdac..1f26260ee1 100644 --- a/requirements/extra-docx.txt +++ b/requirements/extra-docx.txt @@ -8,5 +8,9 @@ lxml==4.9.3 # via # -c requirements/base.txt # python-docx -python-docx==0.8.11 +python-docx==1.0.0 # via -r requirements/extra-docx.in +typing-extensions==4.8.0 + # via + # -c requirements/base.txt + # python-docx diff --git a/requirements/extra-odt.txt b/requirements/extra-odt.txt index ea84eaa7c6..5b6d1969a1 100644 --- a/requirements/extra-odt.txt +++ b/requirements/extra-odt.txt @@ -10,5 +10,9 @@ lxml==4.9.3 # python-docx pypandoc==1.11 # via -r requirements/extra-odt.in -python-docx==0.8.11 +python-docx==1.0.0 # via -r requirements/extra-odt.in +typing-extensions==4.8.0 + # via + # -c requirements/base.txt + # python-docx diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index bde6ee8e54..ecc6c77202 100644 --- 
a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -31,7 +31,7 @@ contourpy==1.1.1 # via matplotlib cssselect==1.2.0 # via premailer -cssutils==2.7.1 +cssutils==2.8.0 # via premailer cycler==0.12.1 # via matplotlib diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in index f8fe0265cb..caec093f9d 100644 --- a/requirements/extra-pdf-image.in +++ b/requirements/extra-pdf-image.in @@ -5,7 +5,7 @@ pdf2image pdfminer.six # Do not move to contsraints.in, otherwise unstructured-inference will not be upgraded # when unstructured library is. -unstructured-inference==0.7.2 +unstructured-inference==0.7.3 # unstructured fork of pytesseract that provides an interface to allow for multiple output formats # from one tesseract call unstructured.pytesseract>=0.3.12 diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index cadefa07b7..61cb8b19b1 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -149,7 +149,7 @@ pyparsing==3.0.9 # via # -c requirements/constraints.in # matplotlib -pypdfium2==4.20.0 +pypdfium2==4.21.0 # via pdfplumber pytesseract==0.3.10 # via layoutparser @@ -231,7 +231,7 @@ typing-extensions==4.8.0 # torch tzdata==2023.3 # via pandas -unstructured-inference==0.7.2 +unstructured-inference==0.7.3 # via -r requirements/extra-pdf-image.in unstructured-pytesseract==0.3.12 # via diff --git a/requirements/ingest-azure.txt b/requirements/ingest-azure.txt index 85421ffada..d4e7acc4fd 100644 --- a/requirements/ingest-azure.txt +++ b/requirements/ingest-azure.txt @@ -23,7 +23,7 @@ azure-datalake-store==0.0.53 # via adlfs azure-identity==1.14.1 # via adlfs -azure-storage-blob==12.18.2 +azure-storage-blob==12.18.3 # via adlfs certifi==2023.7.22 # via diff --git a/requirements/ingest-gcs.txt b/requirements/ingest-gcs.txt index 8dd592c453..00879df44e 100644 --- a/requirements/ingest-gcs.txt +++ b/requirements/ingest-gcs.txt @@ -64,7 +64,7 @@ google-crc32c==1.5.0 # 
via google-resumable-media google-resumable-media==2.6.0 # via google-cloud-storage -googleapis-common-protos==1.60.0 +googleapis-common-protos==1.61.0 # via google-api-core idna==3.4 # via diff --git a/requirements/ingest-google-drive.txt b/requirements/ingest-google-drive.txt index 6b21ef778d..b11ae57296 100644 --- a/requirements/ingest-google-drive.txt +++ b/requirements/ingest-google-drive.txt @@ -17,7 +17,7 @@ charset-normalizer==3.3.0 # requests google-api-core==2.12.0 # via google-api-python-client -google-api-python-client==2.102.0 +google-api-python-client==2.103.0 # via -r requirements/ingest-google-drive.in google-auth==2.23.3 # via @@ -26,7 +26,7 @@ google-auth==2.23.3 # google-auth-httplib2 google-auth-httplib2==0.1.1 # via google-api-python-client -googleapis-common-protos==1.60.0 +googleapis-common-protos==1.61.0 # via google-api-core httplib2==0.22.0 # via diff --git a/requirements/ingest-openai.txt b/requirements/ingest-openai.txt index a6c168cb4e..10b11ead5a 100644 --- a/requirements/ingest-openai.txt +++ b/requirements/ingest-openai.txt @@ -50,7 +50,7 @@ jsonpatch==1.33 # via langchain jsonpointer==2.4 # via jsonpatch -langchain==0.0.311 +langchain==0.0.313 # via -r requirements/ingest-openai.in langsmith==0.0.43 # via langchain diff --git a/requirements/test.txt b/requirements/test.txt index 0b519e9cb3..005a1c0c5c 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -54,7 +54,7 @@ mccabe==0.7.0 # via flake8 multidict==6.0.4 # via yarl -mypy==1.5.1 +mypy==1.6.0 # via -r requirements/test.in mypy-extensions==1.0.0 # via diff --git a/test_unstructured/partition/pdf_image/test_image.py b/test_unstructured/partition/pdf_image/test_image.py index 0e86162f33..82ca45a3de 100644 --- a/test_unstructured/partition/pdf_image/test_image.py +++ b/test_unstructured/partition/pdf_image/test_image.py @@ -62,7 +62,7 @@ def __init__(self, number: int, image: Image): @property def elements(self): return [ - layout.LayoutElement( + 
layout.LayoutElement.from_coords( type="Title", x1=0, y1=0, diff --git a/test_unstructured/partition/pdf_image/test_ocr.py b/test_unstructured/partition/pdf_image/test_ocr.py index fd1cfc74ef..139de9fb71 100644 --- a/test_unstructured/partition/pdf_image/test_ocr.py +++ b/test_unstructured/partition/pdf_image/test_ocr.py @@ -80,9 +80,9 @@ def test_get_ocr_layout_from_image_tesseract(monkeypatch): ) expected_layout = [ - TextRegion(10, 5, 25, 15, "Hello", source="OCR-tesseract"), - TextRegion(20, 15, 45, 35, "World", source="OCR-tesseract"), - TextRegion(30, 25, 65, 55, "!", source="OCR-tesseract"), + TextRegion.from_coords(10, 5, 25, 15, "Hello", source="OCR-tesseract"), + TextRegion.from_coords(20, 15, 45, 35, "World", source="OCR-tesseract"), + TextRegion.from_coords(30, 25, 65, 55, "!", source="OCR-tesseract"), ] assert ocr_layout == expected_layout @@ -131,9 +131,9 @@ def test_get_ocr_layout_from_image_paddle(monkeypatch): ocr_layout = ocr.get_ocr_layout_from_image(image, ocr_languages="eng", entire_page_ocr="paddle") expected_layout = [ - TextRegion(10, 5, 25, 15, "Hello", source="OCR-paddle"), - TextRegion(20, 15, 45, 35, "World", source="OCR-paddle"), - TextRegion(30, 25, 65, 55, "!", source="OCR-paddle"), + TextRegion.from_coords(10, 5, 25, 15, "Hello", source="OCR-paddle"), + TextRegion.from_coords(20, 15, 45, 35, "World", source="OCR-paddle"), + TextRegion.from_coords(30, 25, 65, 55, "!", source="OCR-paddle"), ] assert ocr_layout == expected_layout @@ -169,9 +169,9 @@ def test_get_ocr_text_from_image_paddle(monkeypatch): @pytest.fixture() def mock_ocr_regions(): return [ - EmbeddedTextRegion(10, 10, 90, 90, text="0", source=None), - EmbeddedTextRegion(200, 200, 300, 300, text="1", source=None), - EmbeddedTextRegion(500, 320, 600, 350, text="3", source=None), + EmbeddedTextRegion.from_coords(10, 10, 90, 90, text="0", source=None), + EmbeddedTextRegion.from_coords(200, 200, 300, 300, text="1", source=None), + EmbeddedTextRegion.from_coords(500, 320, 600, 
350, text="3", source=None), ] @@ -179,13 +179,10 @@ def mock_ocr_regions(): def mock_out_layout(mock_embedded_text_regions): return [ LayoutElement( - r.x1, - r.y1, - r.x2, - r.y2, text=None, source=None, type="Text", + bbox=r.bbox, ) for r in mock_embedded_text_regions ] @@ -194,19 +191,19 @@ def mock_out_layout(mock_embedded_text_regions): def test_aggregate_ocr_text_by_block(): expected = "A Unified Toolkit" ocr_layout = [ - TextRegion(0, 0, 20, 20, "A"), - TextRegion(50, 50, 150, 150, "Unified"), - TextRegion(150, 150, 300, 250, "Toolkit"), - TextRegion(200, 250, 300, 350, "Deep"), + TextRegion.from_coords(0, 0, 20, 20, "A"), + TextRegion.from_coords(50, 50, 150, 150, "Unified"), + TextRegion.from_coords(150, 150, 300, 250, "Toolkit"), + TextRegion.from_coords(200, 250, 300, 350, "Deep"), ] - region = TextRegion(0, 0, 250, 350, "") + region = TextRegion.from_coords(0, 0, 250, 350, "") text = ocr.aggregate_ocr_text_by_block(ocr_layout, region, 0.5) assert text == expected def test_merge_text_regions(mock_embedded_text_regions): - expected = TextRegion( + expected = TextRegion.from_coords( x1=437.83888888888885, y1=317.319341111111, x2=1256.334784222222, @@ -220,7 +217,7 @@ def test_merge_text_regions(mock_embedded_text_regions): def test_get_elements_from_ocr_regions(mock_embedded_text_regions): expected = [ - LayoutElement( + LayoutElement.from_coords( x1=437.83888888888885, y1=317.319341111111, x2=1256.334784222222, @@ -237,14 +234,7 @@ def test_get_elements_from_ocr_regions(mock_embedded_text_regions): @pytest.fixture() def mock_layout(mock_embedded_text_regions): return [ - LayoutElement( - r.x1, - r.y1, - r.x2, - r.y2, - text=r.text, - type="UncategorizedText", - ) + LayoutElement(text=r.text, type="UncategorizedText", bbox=r.bbox) for r in mock_embedded_text_regions ] @@ -252,70 +242,70 @@ def mock_layout(mock_embedded_text_regions): @pytest.fixture() def mock_embedded_text_regions(): return [ - EmbeddedTextRegion( + EmbeddedTextRegion.from_coords( 
x1=453.00277777777774, y1=317.319341111111, x2=711.5338541666665, y2=358.28571222222206, text="LayoutParser:", ), - EmbeddedTextRegion( + EmbeddedTextRegion.from_coords( x1=726.4778125, y1=317.319341111111, x2=760.3308594444444, y2=357.1698966666667, text="A", ), - EmbeddedTextRegion( + EmbeddedTextRegion.from_coords( x1=775.2748177777777, y1=317.319341111111, x2=917.3579885555555, y2=357.1698966666667, text="Unified", ), - EmbeddedTextRegion( + EmbeddedTextRegion.from_coords( x1=932.3019468888888, y1=317.319341111111, x2=1071.8426522222221, y2=357.1698966666667, text="Toolkit", ), - EmbeddedTextRegion( + EmbeddedTextRegion.from_coords( x1=1086.7866105555556, y1=317.319341111111, x2=1141.2105142777777, y2=357.1698966666667, text="for", ), - EmbeddedTextRegion( + EmbeddedTextRegion.from_coords( x1=1156.154472611111, y1=317.319341111111, x2=1256.334784222222, y2=357.1698966666667, text="Deep", ), - EmbeddedTextRegion( + EmbeddedTextRegion.from_coords( x1=437.83888888888885, y1=367.13322999999986, x2=610.0171992222222, y2=406.9837855555556, text="Learning", ), - EmbeddedTextRegion( + EmbeddedTextRegion.from_coords( x1=624.9611575555555, y1=367.13322999999986, x2=741.6754646666665, y2=406.9837855555556, text="Based", ), - EmbeddedTextRegion( + EmbeddedTextRegion.from_coords( x1=756.619423, y1=367.13322999999986, x2=958.3867708333332, y2=406.9837855555556, text="Document", ), - EmbeddedTextRegion( + EmbeddedTextRegion.from_coords( x1=973.3307291666665, y1=367.13322999999986, x2=1092.0535042777776, @@ -327,15 +317,7 @@ def mock_embedded_text_regions(): def test_supplement_layout_with_ocr_elements(mock_layout, mock_ocr_regions): ocr_elements = [ - LayoutElement( - r.x1, - r.y1, - r.x2, - r.y2, - text=r.text, - source=None, - type="UncategorizedText", - ) + LayoutElement(text=r.text, source=None, type="UncategorizedText", bbox=r.bbox) for r in mock_ocr_regions ] @@ -351,21 +333,15 @@ def test_supplement_layout_with_ocr_elements(mock_layout, mock_ocr_regions): # Check if 
the OCR-derived elements that are subregions of layout elements are removed for element in mock_layout: for ocr_element in ocr_elements: - if ocr_element.is_almost_subregion_of(element, ocr.SUBREGION_THRESHOLD_FOR_OCR): + if ocr_element.bbox.is_almost_subregion_of( + element.bbox, ocr.SUBREGION_THRESHOLD_FOR_OCR + ): assert ocr_element not in final_layout def test_merge_out_layout_with_ocr_layout(mock_out_layout, mock_ocr_regions): ocr_elements = [ - LayoutElement( - r.x1, - r.y1, - r.x2, - r.y2, - text=r.text, - source=None, - type="UncategorizedText", - ) + LayoutElement(text=r.text, source=None, type="UncategorizedText", bbox=r.bbox) for r in mock_ocr_regions ] @@ -389,7 +365,7 @@ def test_merge_out_layout_with_ocr_layout(mock_out_layout, mock_ocr_regions): ], ) def test_pad_element_bboxes(padding, expected_bbox): - element = LayoutElement( + element = LayoutElement.from_coords( x1=10, y1=20, x2=30, @@ -403,13 +379,13 @@ def test_pad_element_bboxes(padding, expected_bbox): padded_element = pad_element_bboxes(element, padding) padded_element_bbox = ( - padded_element.x1, - padded_element.y1, - padded_element.x2, - padded_element.y2, + padded_element.bbox.x1, + padded_element.bbox.y1, + padded_element.bbox.x2, + padded_element.bbox.y2, ) assert padded_element_bbox == expected_bbox # make sure the original element has not changed - original_element_bbox = (element.x1, element.y1, element.x2, element.y2) + original_element_bbox = (element.bbox.x1, element.bbox.y1, element.bbox.x2, element.bbox.y2) assert original_element_bbox == expected_original_element_bbox diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index b52e73ec27..f38dfd85d0 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -68,7 +68,7 @@ def __init__(self, number: int, image: Image): @property def elements(self): return [ - layout.LayoutElement( + 
layout.LayoutElement.from_coords( type="Title", x1=0, y1=0, @@ -994,3 +994,27 @@ def test_check_annotations_within_element(threshold, expected): filtered = pdf.check_annotations_within_element(annotations, element_bbox, 1, threshold) results = [annotation in filtered for annotation in annotations] assert results == expected + + +@pytest.fixture(scope="session") +def chipper_results(): + elements = pdf.partition_pdf( + "example-docs/layout-parser-paper-fast.pdf", strategy="hi_res", model_name="chipper" + ) + return elements + + +@pytest.fixture(scope="session") +def chipper_children(chipper_results): + return [el for el in chipper_results if el.metadata.parent_id is not None] + + +def test_chipper_has_hierarchy(chipper_children): + assert chipper_children + + +def test_chipper_not_losing_parents(chipper_results, chipper_children): + assert all( + [el for el in chipper_results if el.id == child.metadata.parent_id] + for child in chipper_children + ) diff --git a/test_unstructured/partition/test_common.py b/test_unstructured/partition/test_common.py index e9751d0f3d..6429229b9d 100644 --- a/test_unstructured/partition/test_common.py +++ b/test_unstructured/partition/test_common.py @@ -1,8 +1,10 @@ +from dataclasses import dataclass +from unittest import mock + import pytest from PIL import Image from unstructured_inference.inference import layout -from unstructured_inference.inference.layout import LayoutElement -from unstructured_inference.inference.layoutelement import LocationlessLayoutElement +from unstructured_inference.inference.layout import DocumentLayout, LayoutElement, PageLayout from unstructured.documents.coordinates import PixelSpace from unstructured.documents.elements import ( @@ -21,6 +23,7 @@ contains_emoji, document_to_element_list, ) +from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_DONT, SORT_MODE_XY_CUT class MockPageLayout(layout.PageLayout): @@ -31,21 +34,25 @@ def __init__(self, number: int, image: Image): @property 
def elements(self): return [ - LocationlessLayoutElement( + LayoutElement( type="Headline", text="Charlie Brown and the Great Pumpkin", + bbox=None, ), - LocationlessLayoutElement( + LayoutElement( type="Subheadline", text="The Beginning", + bbox=None, ), - LocationlessLayoutElement( + LayoutElement( type="Text", text="This time Charlie Brown had it really tricky...", + bbox=None, ), - LocationlessLayoutElement( + LayoutElement( type="Title", text="Another book title in the same page", + bbox=None, ), ] @@ -153,7 +160,7 @@ def test_normalize_layout_element_dict_misc(): def test_normalize_layout_element_layout_element(): - layout_element = LayoutElement( + layout_element = LayoutElement.from_coords( type="Text", x1=1, y1=2, @@ -174,7 +181,7 @@ def test_normalize_layout_element_layout_element(): def test_normalize_layout_element_layout_element_narrative_text(): - layout_element = LayoutElement( + layout_element = LayoutElement.from_coords( type="NarrativeText", x1=1, y1=2, @@ -195,7 +202,7 @@ def test_normalize_layout_element_layout_element_narrative_text(): def test_normalize_layout_element_checked_box(): - layout_element = LayoutElement( + layout_element = LayoutElement.from_coords( type="Checked", x1=1, y1=2, @@ -216,7 +223,7 @@ def test_normalize_layout_element_checked_box(): def test_normalize_layout_element_unchecked_box(): - layout_element = LayoutElement( + layout_element = LayoutElement.from_coords( type="Unchecked", x1=1, y1=2, @@ -237,7 +244,7 @@ def test_normalize_layout_element_unchecked_box(): def test_normalize_layout_element_enumerated_list(): - layout_element = LayoutElement( + layout_element = LayoutElement.from_coords( type="List", x1=1, y1=2, @@ -270,7 +277,7 @@ def test_normalize_layout_element_enumerated_list(): def test_normalize_layout_element_bulleted_list(): - layout_element = LayoutElement( + layout_element = LayoutElement.from_coords( type="List", x1=1, y1=2, @@ -342,6 +349,9 @@ def test_contains_emoji(text, expected): def 
test_document_to_element_list_omits_coord_system_when_coord_points_absent(): layout_elem_absent_coordinates = MockDocumentLayout() + for page in layout_elem_absent_coordinates.pages: + for el in page.elements: + el.bbox = None elements = document_to_element_list(layout_elem_absent_coordinates) assert elements[0].metadata.coordinates is None @@ -419,6 +429,62 @@ def test_set_element_hierarchy_custom_rule_set(): ), "FigureCaption should be child of Title 2" +@dataclass +class MockImage: + width = 640 + height = 480 + format = "JPG" + + +def test_document_to_element_list_handles_parent(): + block1 = LayoutElement.from_coords(1, 2, 3, 4, text="block 1", type="NarrativeText") + block2 = LayoutElement.from_coords( + 1, + 2, + 3, + 4, + text="block 2", + parent=block1, + type="NarrativeText", + ) + page = PageLayout( + number=1, + image=MockImage(), + layout=None, + ) + page.elements = [block1, block2] + doc = DocumentLayout.from_pages([page]) + el1, el2 = document_to_element_list(doc) + assert el2.metadata.parent_id == el1.id + + +@pytest.mark.parametrize( + ("sort_mode", "call_count"), + [(SORT_MODE_DONT, 0), (SORT_MODE_BASIC, 1), (SORT_MODE_XY_CUT, 1)], +) +def test_document_to_element_list_doesnt_sort_on_sort_method(sort_mode, call_count): + block1 = LayoutElement.from_coords(1, 2, 3, 4, text="block 1", type="NarrativeText") + block2 = LayoutElement.from_coords( + 1, + 2, + 3, + 4, + text="block 2", + parent=block1, + type="NarrativeText", + ) + page = PageLayout( + number=1, + image=MockImage(), + layout=None, + ) + page.elements = [block1, block2] + doc = DocumentLayout.from_pages([page]) + with mock.patch.object(common, "sort_page_elements") as mock_sort_page_elements: + document_to_element_list(doc, sortable=True, sort_mode=sort_mode) + assert mock_sort_page_elements.call_count == call_count + + def test_document_to_element_list_sets_category_depth_titles(): layout_with_hierarchies = MockDocumentLayout() elements = document_to_element_list(layout_with_hierarchies) 
diff --git a/test_unstructured/test_utils.py b/test_unstructured/test_utils.py index 1125903408..0f4bd150eb 100644 --- a/test_unstructured/test_utils.py +++ b/test_unstructured/test_utils.py @@ -82,3 +82,31 @@ def __init__(self): import numpy # noqa: F401 TestClass() + + +@pytest.mark.parametrize("iterator", [[0, 1], (0, 1), range(10), [0], (0,), range(1)]) +def test_first_gives_first(iterator): + assert utils.first(iterator) == 0 + + +@pytest.mark.parametrize("iterator", [[], ()]) +def test_first_raises_if_empty(iterator): + with pytest.raises(ValueError): + utils.first(iterator) + + +@pytest.mark.parametrize("iterator", [[0], (0,), range(1)]) +def test_only_gives_only(iterator): + assert utils.only(iterator) == 0 + + +@pytest.mark.parametrize("iterator", [[0, 1], (0, 1), range(10)]) +def test_only_raises_when_len_more_than_1(iterator): + with pytest.raises(ValueError): + utils.only(iterator) + + +@pytest.mark.parametrize("iterator", [[], ()]) +def test_only_raises_if_empty(iterator): + with pytest.raises(ValueError): + utils.only(iterator) diff --git a/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json b/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json index c5ea0bc996..2ae814685f 100644 --- a/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json @@ -1,7 +1,7 @@ [ { "type": "Title", - "element_id": "0b8804afbc4722108e877480e28462a6", + "element_id": "a683b44cbad86146f841c74d46c6425f", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -20,7 +20,7 @@ }, { "type": "NarrativeText", - "element_id": "46b1e4dae5ffd7cdcb2a6ed9f206a8ee", + "element_id": 
"af16ed1a4b463932f7a4084004540226", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -58,7 +58,7 @@ }, { "type": "NarrativeText", - "element_id": "d9644fb4b85468d186b132c91ca64f31", + "element_id": "0480125483856ec593d0dc47efed26a7", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -77,7 +77,7 @@ }, { "type": "Title", - "element_id": "c8e51fdc53c202393adad77f7f93ee5a", + "element_id": "5af85d9428f02bd89d9adcbafcddd56b", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -96,7 +96,7 @@ }, { "type": "NarrativeText", - "element_id": "d6df9cd66da09d30c16d194e877766ca", + "element_id": "69210a48a5d82a5d17035b355ffa007f", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -115,7 +115,7 @@ }, { "type": "ListItem", - "element_id": "04ff84b51fab69c07381ac794b740243", + "element_id": "f490ac00003fcf9c8e2048b7f84c7e03", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -134,7 +134,7 @@ }, { "type": "ListItem", - "element_id": "eca1ce0fb28f9aee393eb53e1d63b30e", + "element_id": "ff686e6046cd8176988d8dec0d8adac4", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -153,7 +153,7 @@ }, { "type": "ListItem", - "element_id": "8b02f539eb8ccee5b3fc24f66858188c", + "element_id": "a8fefd2c51a2e783c60f4ad2947b8dc7", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -172,7 +172,7 @@ }, { "type": "ListItem", - "element_id": "469e981f34d1e6f2b420574ed8e932d2", + "element_id": "ba90804d766fc25305a415a9568ed86f", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ 
-191,7 +191,7 @@ }, { "type": "ListItem", - "element_id": "4b8fc76cbba0e2fef79ff8bc668b1401", + "element_id": "d00f9c6e8d4992d5fffe2bfca4c5a61b", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -210,7 +210,7 @@ }, { "type": "NarrativeText", - "element_id": "69da7754428f154ee3b2906214d31ad9", + "element_id": "38bfc67c105fb76aa426f4d7c1045205", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -229,7 +229,7 @@ }, { "type": "Title", - "element_id": "37486ef32cbf05082d5dbff0581db762", + "element_id": "ea81396d7186842db62efca0a0b24158", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -248,7 +248,7 @@ }, { "type": "NarrativeText", - "element_id": "cfe4cc76625dc82267d95ec1dc7e7813", + "element_id": "eb990c6ddce9ef7d085d006a519f2ac4", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -286,7 +286,7 @@ }, { "type": "NarrativeText", - "element_id": "edd5f2f5a60a83c8899e533ac8bcd03c", + "element_id": "4d6d4f9d658c0c40e219af1bd65d6b85", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -305,7 +305,7 @@ }, { "type": "Title", - "element_id": "3c36cd10b2e64b9f2169f05abddd4981", + "element_id": "521a8660cf9df4a6eb675601e0378dc1", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -324,7 +324,7 @@ }, { "type": "NarrativeText", - "element_id": "987542acede56f098db655f02fb814a7", + "element_id": "f3953a4b1aa6bf42305393694fb4e428", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -343,7 +343,7 @@ }, { "type": "ListItem", - "element_id": "2e3cec7bff1e8c8d8e0087f0bcfa89f0", + "element_id": 
"79b682d97de5a63ee664ff7d1e5c4ea9", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -362,7 +362,7 @@ }, { "type": "ListItem", - "element_id": "c6865d507571ccb14d37791134f27f61", + "element_id": "91ae53948262afff2ea72e0d3bc5cb98", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -381,7 +381,7 @@ }, { "type": "ListItem", - "element_id": "3f14cc0782485365bad0539f7b1bbb22", + "element_id": "393bc42351625088ac1ac8f75ee89400", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -400,7 +400,7 @@ }, { "type": "NarrativeText", - "element_id": "c2e95867ed0f25e3d9fe1a6b97447ab9", + "element_id": "294abbac3c259144af3228a7fa06a3e2", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", diff --git a/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json index 0f56268d18..0667825511 100644 --- a/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json @@ -1,7 +1,7 @@ [ { "type": "Header", - "element_id": "c1f4b5ba045830c1866db8f8aa0b54ac", + "element_id": "fe4f2822da5bf5f7cae18c9d0081d312", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -11,7 +11,7 @@ }, { "type": "NarrativeText", - "element_id": "869adddb184177031536477262e0dde0", + "element_id": "721bbccb8e92736ad5776ad812dc8bca", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -21,7 +21,7 @@ }, { "type": "Title", - "element_id": "e6fa42b5b4d85001b900e47c050b645b", + "element_id": "01a8470646d06d1c29b9f43915ac04e0", "metadata": { "data_source": {}, "filetype": 
"application/pdf", @@ -31,7 +31,7 @@ }, { "type": "NarrativeText", - "element_id": "9234133787d0a6b3976b16569c0b5cf3", + "element_id": "aefc5298031dc9a700e66da55852c6b8", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -41,7 +41,7 @@ }, { "type": "NarrativeText", - "element_id": "ac01687ab870e4bb6e7313db4654928a", + "element_id": "8bdb2c4b6997a1bbd7b948d7aeb402e2", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -51,7 +51,7 @@ }, { "type": "Title", - "element_id": "9ce2527454e3b72c1ba73e179779361d", + "element_id": "fcac8777e4bfd43297bbdb237a318b7f", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -71,7 +71,7 @@ }, { "type": "Title", - "element_id": "4f14d967ea87a75ad1acee27ff34e59e", + "element_id": "6d1999c49562bd7c2b15a41327b8fc36", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -81,7 +81,7 @@ }, { "type": "NarrativeText", - "element_id": "cb64167b76eb9bc1d0dc4771969a3724", + "element_id": "53b670b88090fce85714890f1696853b", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -131,7 +131,7 @@ }, { "type": "NarrativeText", - "element_id": "4a03002c97925cd9397927ac823369e7", + "element_id": "ec72ddc98d060198a58854666bc2af39", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -141,7 +141,7 @@ }, { "type": "NarrativeText", - "element_id": "08bb309957586c280660c11c337dc6d7", + "element_id": "0b4c9205b52f320b7a727a310d1b9c61", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -151,7 +151,7 @@ }, { "type": "NarrativeText", - "element_id": "26c73759c3d3cc29d683910c034432da", + "element_id": "11cb61ac68da3f1039df950c887ac3db", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -181,7 +181,7 @@ }, { "type": "Title", - "element_id": "5abba9b1f2c341e0b299fa43a90d0e14", + "element_id": "c965cb5900649ea7abbf36e0d6a13547", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -191,7 +191,7 @@ }, { "type": "NarrativeText", - 
"element_id": "ac89a2886224c42ad15982cd34421ff8", + "element_id": "a497ec33825f1793d0403f872220ffa5", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -211,7 +211,7 @@ }, { "type": "ListItem", - "element_id": "e102dc7c1db28c29d5e4bde8062592ed", + "element_id": "1c2879aacaaeedb75720c6e9f57edf67", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -231,7 +231,7 @@ }, { "type": "Header", - "element_id": "78f135d64d5e1307cac651608256a418", + "element_id": "722bbe57a365b0aec3839d5228ca8c62", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -251,13 +251,13 @@ }, { "type": "Table", - "element_id": "6911009421d6126fc96a193e8e7b8c87", + "element_id": "82391aed75376c2c3bc734ad52ec73e4", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "How data were acquired The cleaned and weighed specimen was suspended in beakers con-taining 0.5 M H2SO4 solution of different concentrations of egg shellpowder. The pre-weighed stainless steel samples were retrieved fromthe test solutions after every 24 h, cleaned appropriately, dried andreweighed.Raw, analyzedThe difference between the weight at a given time and the initialweight of the specimen was taken as the weight loss, which was usedto calculate the corrosion rate and inhibition efficiency.Inhibitor concentration, exposure timeDepartment of Chemical, Metallurgical and Materials Engineering,Tshwane University of Technology, Pretoria, South AfricaData are available within this articleO. Sanni, A. P. I. Popoola, and O. S. I. Fayomi, Enhanced corrosionresistance of stainless steel type 316 in sulphuric acid solution usingeco-friendly waste product, Results in Physics, 9 (2018) 225–230. 
Data formatExperimental factors Experimental featuresData source location AccessibilityRelated research article" + "text": "How data were acquired The cleaned and weighed specimen was suspended in beakers con- taining 0.5 M H2SO4 solution of different concentrations of egg shell powder. The pre-weighed stainless steel samples were retrieved from the test solutions after every 24 h, cleaned appropriately, dried and reweighed. Raw, analyzed The difference between the weight at a given time and the initial weight of the specimen was taken as the weight loss, which was used to calculate the corrosion rate and inhibition efficiency. Inhibitor concentration, exposure time Department of Chemical, Metallurgical and Materials Engineering, Tshwane University of Technology, Pretoria, South Africa Data are available within this article O. Sanni, A. P. I. Popoola, and O. S. I. Fayomi, Enhanced corrosion resistance of stainless steel type 316 in sulphuric acid solution using eco-friendly waste product, Results in Physics, 9 (2018) 225–230. 
Data format Experimental factors Experimental features Data source location Accessibility Related research article" }, { "type": "NarrativeText", @@ -311,7 +311,7 @@ }, { "type": "Title", - "element_id": "e63f0ed399f0537c9ffeadfcae3baed6", + "element_id": "12edc7514919a752d95d3b8e626e45ca", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -321,7 +321,7 @@ }, { "type": "ListItem", - "element_id": "1daeb29ccbc793481f453c7f76b8795b", + "element_id": "eea806a9ad33194f383205ef0cce7774", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -331,7 +331,7 @@ }, { "type": "ListItem", - "element_id": "7aad924d1c00e3d50bc0c24beb00a9e5", + "element_id": "db51aa8084450acf421ebcc239afeaac", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -351,7 +351,7 @@ }, { "type": "ListItem", - "element_id": "1ddde62c3188f81dfc835b6f036f1734", + "element_id": "1c4e65698ec8b13c8ea699224fba15c2", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -361,7 +361,7 @@ }, { "type": "Title", - "element_id": "1c3f3de4e65aae5bd147f84779712a65", + "element_id": "869087d0a7770d68ff7b67de54507c6d", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -371,7 +371,7 @@ }, { "type": "NarrativeText", - "element_id": "5034c7315aface0b263361d0eae1dd15", + "element_id": "59396eaa9450c0f5c87414c88588995c", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -381,13 +381,13 @@ }, { "type": "Image", - "element_id": "3dd23b04172eaa4ac70b822fde1d6569", + "element_id": "ba559032c2f9f98c24e4c547af135b8e", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "30 10g8g6g4g2gControl )gm ( sso 20 l thgeW i 10 48 96 144 192" + "text": "30 10g 8g 6g 4g 2g Control ) g m ( s s o 20 l t h g e W i 10 48 96 144 192 " }, { "type": "Title", @@ -401,17 +401,17 @@ }, { "type": "FigureCaption", - "element_id": "f5289d20374c576627b200df3b4e5a85", + "element_id": "7e776fc583357f98358877d4c0b48b7f", 
"metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "Fig. 1. Weight loss versus exposure time for stainless steelpresence of ES. immersed in 0.5 M H2SO4 solution in the absence and" + "text": "Fig. 1. Weight loss versus exposure time for stainless steel presence of ES. immersed in 0.5 M H2SO4 solution in the absence and" }, { "type": "NarrativeText", - "element_id": "8a54dcaa0e2720786903e26e84bd9e93", + "element_id": "f19a9b9a2dd2d7ebaa58eb2c98ecf702", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -421,13 +421,13 @@ }, { "type": "Image", - "element_id": "d4434406b5bb0d9269431d330ec551cc", + "element_id": "f2e384e79a4fbce052f262a93ec46102", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "2.7 1.8 10g8g6g4g2gControl 0.9 24 48 72 96 120 144 168 192 Exposure time" + "text": "2.7 1.8 10g 8g 6g 4g 2g Control 0.9 24 48 72 96 120 144 168 192 Exposure time " }, { "type": "NarrativeText", @@ -471,7 +471,7 @@ }, { "type": "FigureCaption", - "element_id": "e5d46bc8ceb17f88e1cff33ecac97067", + "element_id": "7ecc0fe6294fecb9c560f3731532f042", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -501,13 +501,13 @@ }, { "type": "Image", - "element_id": "aa9468183225a7eec11024085c42365b", + "element_id": "00c6c21aa97f59dc84190f023eaaf769", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "90 2g4g6g8g10g 80 ) % 70 ( ycneciff 60 i 50 EnoitibhnI 40 i 30 20 10 0 20 40 60 80 100 120 140 160 180" + "text": "90 2g 4g 6g 8g 10g 80 ) % 70 ( y c n e c i f f 60 i 50 E n o i t i b h n I 40 i 30 20 10 0 20 40 60 80 100 120 140 160 180 " }, { "type": "Title", @@ -521,7 +521,7 @@ }, { "type": "FigureCaption", - "element_id": "950ca7babbae92e76df97f7ee57bc05c", + "element_id": "5b441ff866b6efc9e9a1d97675f16d57", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -531,7 +531,7 @@ }, { "type": "NarrativeText", - 
"element_id": "83f15bc914c3bfceaa571de50ab77f11", + "element_id": "0de81536210dbfaa5ed6cb4d8446887a", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -541,7 +541,7 @@ }, { "type": "Header", - "element_id": "135be522765ce267b8ca6debeeec6dc4", + "element_id": "0522c4531b49419253e7fcd53a6a1849", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -551,7 +551,7 @@ }, { "type": "Header", - "element_id": "8d9bcdac558e606c913189b6ce8db44c", + "element_id": "231b7e5ddbae0ab42adfe20f19e194c1", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -561,7 +561,7 @@ }, { "type": "NarrativeText", - "element_id": "8a54dcaa0e2720786903e26e84bd9e93", + "element_id": "f19a9b9a2dd2d7ebaa58eb2c98ecf702", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -581,7 +581,7 @@ }, { "type": "FigureCaption", - "element_id": "e8f34726e919c7e2f4d00f6fcf511ef8", + "element_id": "a2c77d1d5eec0137df5cf9ad22b05b89", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -601,13 +601,13 @@ }, { "type": "Table", - "element_id": "c6738f6e333074d3151fb3b9466c26d7", + "element_id": "150b064badb909ac7549f8064cf2caba", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "icorr (A/cm2) Polarizationresistance (Ω) Inhibitorconcentration (g) bc (V/dec) ba (V/dec) Ecorr (V) (cid:3) 0.9393(cid:3) 0.8276(cid:3) 0.8825(cid:3) 0.8027(cid:3) 0.5896(cid:3) 0.5356 0246810 0.03351.94600.01630.32330.12400.0382 0.04090.05960.23690.05400.05560.0086 0.00030.00020.00015.39E-055.46E-051.24E-05 24.0910121.44042.121373.180305.650246.080 2.81631.50540.94760.43180.37720.0919" + "text": "icorr (A/cm2) Polarization resistance (Ω) Inhibitor concentration (g) bc (V/dec) ba (V/dec) Ecorr (V) (cid:3) 0.9393 (cid:3) 0.8276 (cid:3) 0.8825 (cid:3) 0.8027 (cid:3) 0.5896 (cid:3) 0.5356 0 2 4 6 8 10 0.0335 1.9460 0.0163 0.3233 0.1240 0.0382 0.0409 0.0596 0.2369 0.0540 0.0556 0.0086 0.0003 0.0002 0.0001 5.39E-05 5.46E-05 
1.24E-05 24.0910 121.440 42.121 373.180 305.650 246.080 2.8163 1.5054 0.9476 0.4318 0.3772 0.0919" }, { "type": "Title", @@ -751,7 +751,7 @@ }, { "type": "NarrativeText", - "element_id": "ef5851c1e7629b7329ac014d7fb9e9e1", + "element_id": "5f9848745de2ea8e033fa60c5c042a02", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -781,17 +781,17 @@ }, { "type": "Image", - "element_id": "3f35abf61a71e8341d4e51645645724f", + "element_id": "5d44e8f4eeacb0f7c9b351db3adb88da", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "12 10 8 0/C 6 4 2 2 4 6 8 10 Concentration (g)" + "text": "12 10 8 0 / C 6 4 2 2 4 6 8 10 Concentration (g) " }, { "type": "FigureCaption", - "element_id": "8e9636a780701abc4f16c3f890b8a83f", + "element_id": "a24d672821f7acb0bbe2c8a813debe16", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -801,7 +801,7 @@ }, { "type": "Header", - "element_id": "8a54dcaa0e2720786903e26e84bd9e93", + "element_id": "f19a9b9a2dd2d7ebaa58eb2c98ecf702", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -821,7 +821,7 @@ }, { "type": "FigureCaption", - "element_id": "6121f41a05c15afa2efe50af3e838da4", + "element_id": "d05dc63af5311bbc038193832984955e", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -831,7 +831,7 @@ }, { "type": "FigureCaption", - "element_id": "ccc8ab2aeabd9a0f745b9f0f6fcbef6e", + "element_id": "0dcaad338c239bbf75e7c2a59a932d7d", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -851,7 +851,7 @@ }, { "type": "FigureCaption", - "element_id": "d8bc58d446376a881b51208b9a8ee7b7", + "element_id": "be145f71b3a113e82026069ef4c9f433", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -861,7 +861,7 @@ }, { "type": "Header", - "element_id": "b5c1fe3f2fa0ef8280a53620dcb31175", + "element_id": "d76f8ad904c5a9e35001a900ae3ed88f", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -871,7 +871,7 @@ }, { "type": 
"Header", - "element_id": "cdfba543ee8ef7fdb3d8b587648cc22d", + "element_id": "9bf474b2db1e3ee9304b15bce195346b", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -881,7 +881,7 @@ }, { "type": "Header", - "element_id": "8a54dcaa0e2720786903e26e84bd9e93", + "element_id": "f19a9b9a2dd2d7ebaa58eb2c98ecf702", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -891,7 +891,7 @@ }, { "type": "Title", - "element_id": "e00efc537994ab576eaec5a387a5ebc0", + "element_id": "3a414521d26b866f142091764bcc8203", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -901,7 +901,7 @@ }, { "type": "NarrativeText", - "element_id": "d277e2ba1e8cbda383b0e51703c281c8", + "element_id": "f01d2d9e61623d168a81d2ebe27caca5", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -911,7 +911,7 @@ }, { "type": "NarrativeText", - "element_id": "c90848f07a922eff3615e5aa1ee78a2f", + "element_id": "c335c40a5ae561ec29775b60fdde4801", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -921,7 +921,7 @@ }, { "type": "FigureCaption", - "element_id": "c07eeb615f8b0f2d544348b7f0655301", + "element_id": "389bd6e22f3ac105897fa0a75807197d", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -931,7 +931,7 @@ }, { "type": "NarrativeText", - "element_id": "63584e8d8b4c14d1542778c155ee4b78", + "element_id": "bebf055a92b9b826a4f5e3ca16bf26f7", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -941,7 +941,7 @@ }, { "type": "NarrativeText", - "element_id": "df69621940968ac24afd990f838f8720", + "element_id": "017d2ebcb4cb3c55c319a81f455c22cd", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -951,7 +951,7 @@ }, { "type": "NarrativeText", - "element_id": "c9b27a380aea7dc5245745a28309b5ce", + "element_id": "0af7fda53c3d6250b7db5bdca8b425fa", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -981,7 +981,7 @@ }, { "type": "NarrativeText", - "element_id": 
"cecb8b44c9af4b76e85155170c509729", + "element_id": "05b80da015b27ce53274cccfa0a1ffbf", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1001,7 +1001,7 @@ }, { "type": "NarrativeText", - "element_id": "037926f4964663644ec21194965e103a", + "element_id": "4fec98f0ec22ea41717dc57408c6f864", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1011,7 +1011,7 @@ }, { "type": "Formula", - "element_id": "cc05223fa08ae55b84d4d264ac735591", + "element_id": "aa6f7054b1f7b0d8d7cdbe5894c060de", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1021,17 +1021,17 @@ }, { "type": "Formula", - "element_id": "68670005ee5fcb70031fb04896b34fee", + "element_id": "aa63e6aba52eb53a896d01e2c7ccc133", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 6 }, - "text": "IE ð%Þ ¼ CRo (cid:3) CR 1001 x CRo" + "text": "IE ð%Þ ¼ CRo (cid:3) CR 100 1 x CRo" }, { "type": "NarrativeText", - "element_id": "4e14cf7db9d9e827482861e7576a1d07", + "element_id": "14acadaed6bb4af2ffac3a4d5f5ccfb9", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1041,7 +1041,7 @@ }, { "type": "NarrativeText", - "element_id": "5dda1fad7e503afe6240d736d50bbe7a", + "element_id": "d35e4d0bd7c3b68d87a080ef5cbc58b2", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1051,7 +1051,7 @@ }, { "type": "NarrativeText", - "element_id": "f97dc933134705c39e5cb717f7813e07", + "element_id": "01f558c62bb60f363b05d4810359dc76", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1101,7 +1101,7 @@ }, { "type": "NarrativeText", - "element_id": "01f3f73499621b0a04142f29982336c1", + "element_id": "940fdd56bdabaede9bd2d4999f0f84a6", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1111,7 +1111,7 @@ }, { "type": "Title", - "element_id": "9619869f5960ea0375b649dd8cc388a5", + "element_id": "ee7d6fc036b5c1d6c5f5ebb9bf533f01", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1121,7 
+1121,7 @@ }, { "type": "NarrativeText", - "element_id": "dbfead4a6bc5e94c6d8f7de9666b6f30", + "element_id": "387e6191442eebcb5c2b536384cda585", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1131,7 +1131,7 @@ }, { "type": "Title", - "element_id": "81db7fab0806640b0cbbac862671704f", + "element_id": "9478c846b097ffc916c7b084527ae320", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1141,7 +1141,7 @@ }, { "type": "NarrativeText", - "element_id": "eaf72c6c69d317c502026ecf01d28b09", + "element_id": "35399b6ef065014d1b8aadbe32c035a6", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1151,7 +1151,7 @@ }, { "type": "Title", - "element_id": "e56261e0bd30965b8e68ed2abb15b141", + "element_id": "a0d7deccf89e42d02a9d66b0c1889689", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1161,7 +1161,7 @@ }, { "type": "ListItem", - "element_id": "5726b8fc4e58aa0b9f5c578bae2dc200", + "element_id": "5a9cda8b066365071a0f63ac2b4b9f03", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1171,7 +1171,7 @@ }, { "type": "ListItem", - "element_id": "b863f47caee2b6d11b3324058d361e15", + "element_id": "2782a9a4dd48cfa04cdbcadbb4463ae1", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1181,7 +1181,7 @@ }, { "type": "ListItem", - "element_id": "ded4a223b42867bb411f5dff514cbe8a", + "element_id": "09194d18c5f6aa76c2d46e3e40eed2a8", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1191,7 +1191,7 @@ }, { "type": "ListItem", - "element_id": "a3b65d4f88d6909004419ec92682d14a", + "element_id": "bf0eb624640aa4f8eb413def565b3024", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1201,7 +1201,7 @@ }, { "type": "ListItem", - "element_id": "3cd4caf23cd72a06fbf01b16df13ec1f", + "element_id": "18f7dc6f8ce23cbb7d7c78c627e72a1a", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1211,7 +1211,7 @@ }, { "type": "Header", - "element_id": 
"b2dc92f9e9858319664f918c69457257", + "element_id": "d3da688dd12a1b3f3194b302281a0ada", "metadata": { "data_source": {}, "filetype": "application/pdf", diff --git a/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json index b0e9f2151d..432664935c 100644 --- a/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json @@ -1,7 +1,7 @@ [ { "type": "Header", - "element_id": "0af8327dc6c8a1694bd0fc75da243db4", + "element_id": "05faeab0ea2447b52f301469e8a795a5", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -11,7 +11,7 @@ }, { "type": "NarrativeText", - "element_id": "869adddb184177031536477262e0dde0", + "element_id": "721bbccb8e92736ad5776ad812dc8bca", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -21,7 +21,7 @@ }, { "type": "Title", - "element_id": "e6fa42b5b4d85001b900e47c050b645b", + "element_id": "01a8470646d06d1c29b9f43915ac04e0", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -31,7 +31,7 @@ }, { "type": "NarrativeText", - "element_id": "9234133787d0a6b3976b16569c0b5cf3", + "element_id": "aefc5298031dc9a700e66da55852c6b8", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -41,7 +41,7 @@ }, { "type": "Title", - "element_id": "ac01687ab870e4bb6e7313db4654928a", + "element_id": "8bdb2c4b6997a1bbd7b948d7aeb402e2", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -51,7 +51,7 @@ }, { "type": "Title", - "element_id": "d641dde82cdafdae78cadfdcb9ce11c6", + "element_id": "4a134e9565b899d20e354052439c55c1", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -71,7 +71,7 @@ }, { "type": "Title", - "element_id": "adf50fc70e660740d796f43a2ba5f500", + "element_id": "9d8efece3117b2eec928f8ee4d4888e4", 
"metadata": { "data_source": {}, "filetype": "application/pdf", @@ -121,7 +121,7 @@ }, { "type": "NarrativeText", - "element_id": "32133fc9f028473fb3d3d2ca24382c28", + "element_id": "4691a05d0f1f6a6f5f97a655c509cead", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -131,7 +131,7 @@ }, { "type": "NarrativeText", - "element_id": "dc4030a630e58a9d83ca4b1663c14a14", + "element_id": "f1a6d69797383afd30aab391b741456b", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -161,7 +161,7 @@ }, { "type": "ListItem", - "element_id": "5810d7d862f5f5d65e257a3ed9b102ac", + "element_id": "ab239efbc2d02a36dbbe66133ec68bf4", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -181,7 +181,7 @@ }, { "type": "NarrativeText", - "element_id": "e326e74f4607af7d370e049bc5d9e66a", + "element_id": "f64bebb0be23116b44b4ad54968178a0", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -191,7 +191,7 @@ }, { "type": "Title", - "element_id": "39826c423283dfd91f1dbd34664ce038", + "element_id": "83c8f5958f7caaf2d10c1d0692b48beb", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -201,13 +201,13 @@ }, { "type": "Table", - "element_id": "1cec53b2a6a74e4028601d759d084022", + "element_id": "81d9754d9d3f2a891323ee25d9496ffe", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "Subject areaOperations researchMore specific subject area Vehicle schedulingType of dataHow data were acquired Tables, text filesArtificially generated by a C þ þ program on Intels Xeons CPU E5–2670 v2 with Linux operating system.RawSixty randomly generated instances of the MDVSP with the number ofdepots in (8, 12, 16) and the number of trips in (1500, 2000, 2500, 3000)Randomly generated instancesIITB-Monash Research Academy, IIT Bombay, Powai, Mumbai, India.Data can be downloaded from https://orlib.uqcloud.net/Kulkarni, S., Krishnamoorthy, M., Ranade, A., Ernst, A.T. and Patil, R.,2018. 
A new formulation and a column generation-based heuristic forthe multiple depot vehicle scheduling problem. TransportationResearch Part B: Methodological, 118, pp. 457–487 [3]. Data formatExperimental factors Experimental featuresData source locationData accessibilityRelated research article" + "text": "Subject area Operations research More specific subject area Vehicle scheduling Type of data How data were acquired Tables, text files Artificially generated by a C þ þ program on Intels Xeons CPU E5– 2670 v2 with Linux operating system. Raw Sixty randomly generated instances of the MDVSP with the number of depots in (8, 12, 16) and the number of trips in (1500, 2000, 2500, 3000) Randomly generated instances IITB-Monash Research Academy, IIT Bombay, Powai, Mumbai, India. Data can be downloaded from https://orlib.uqcloud.net/ Kulkarni, S., Krishnamoorthy, M., Ranade, A., Ernst, A.T. and Patil, R., 2018. A new formulation and a column generation-based heuristic for the multiple depot vehicle scheduling problem. Transportation Research Part B: Methodological, 118, pp. 457–487 [3]. 
Data format Experimental factors Experimental features Data source location Data accessibility Related research article" }, { "type": "NarrativeText", @@ -251,7 +251,7 @@ }, { "type": "Title", - "element_id": "e63f0ed399f0537c9ffeadfcae3baed6", + "element_id": "12edc7514919a752d95d3b8e626e45ca", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -271,7 +271,7 @@ }, { "type": "ListItem", - "element_id": "bd7d750cb9f652c80c17a264072b8858", + "element_id": "fc52e4abc1c21ce5eefec2a08463bf06", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -281,7 +281,7 @@ }, { "type": "ListItem", - "element_id": "92bb89334947a9bff49f4e2895ef0c51", + "element_id": "69010f31402211bc0029986b9c2c2a27", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -301,7 +301,7 @@ }, { "type": "ListItem", - "element_id": "24d7f2ed4386a169639b93a5bf03fd79", + "element_id": "8d1d4a30fe1af010fb36301a2ad547b3", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -321,7 +321,7 @@ }, { "type": "Title", - "element_id": "1c3f3de4e65aae5bd147f84779712a65", + "element_id": "869087d0a7770d68ff7b67de54507c6d", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -331,7 +331,7 @@ }, { "type": "NarrativeText", - "element_id": "07732da32c53fed3ffd5342c61ab643b", + "element_id": "7ebd5adce610926720d9bbc0bac01453", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -361,17 +361,17 @@ }, { "type": "NarrativeText", - "element_id": "8ca260e031eaab2e60b6eb7d3231e6bf", + "element_id": "4e5faed345ed29d23513a466e412ec0a", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "i , a start location, ls i , and an end location, lei , i , an end time, te and" + "text": "i , a start location, ls i , and an end location, le i , i , an end time, te and" }, { "type": "ListItem", - "element_id": "dcb60b2d7218e86946c2235aad0b6008", + "element_id": "f2506416c6fe350d3366f89be8d932cb", "metadata": { 
"data_source": {}, "filetype": "application/pdf", @@ -381,7 +381,7 @@ }, { "type": "NarrativeText", - "element_id": "1c2201af9853b59ded4805bba287a829", + "element_id": "6e53e12df12d0e3d2cc75a0a91fca6c1", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -391,7 +391,7 @@ }, { "type": "Header", - "element_id": "28b33efedc139452525a280e548c029b", + "element_id": "bd9192f280163aee290b5bf5794af67f", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -401,7 +401,7 @@ }, { "type": "Header", - "element_id": "8d0736d21edd4e194e5db02347e129c7", + "element_id": "906c45a5da3d7f5a24920bb99bcadb9c", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -411,7 +411,7 @@ }, { "type": "Header", - "element_id": "e326e74f4607af7d370e049bc5d9e66a", + "element_id": "f64bebb0be23116b44b4ad54968178a0", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -421,7 +421,7 @@ }, { "type": "NarrativeText", - "element_id": "9f77f0db3a785a5bb491fb79fe54cfa0", + "element_id": "a34ec26cfe7e88199e3455fc4297cf2e", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -431,7 +431,7 @@ }, { "type": "NarrativeText", - "element_id": "694b9c582265698bf49806b056c64adc", + "element_id": "5f9fb229a1da066e2639569d950c70b8", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -451,7 +451,7 @@ }, { "type": "ListItem", - "element_id": "e46a5a30f05d06e82d8b7d10448de683", + "element_id": "d6b86e26ceb3141a47f4f3f4f8ef75d3", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -461,7 +461,7 @@ }, { "type": "NarrativeText", - "element_id": "1c59f2a7ce8a3fa55810df93d58e636e", + "element_id": "4e15b7ef25e73d8d7d628348abe01717", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -471,7 +471,7 @@ }, { "type": "NarrativeText", - "element_id": "149eebcec86a1b9a43b93af13952870b", + "element_id": "1216750872db48abd1b02374f86a64b4", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -481,7 
+481,7 @@ }, { "type": "NarrativeText", - "element_id": "e731dc92fddc0512e142bfb2bed62bbf", + "element_id": "8f4fa84bc5bb3afefc329c7ee4035de8", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -501,13 +501,13 @@ }, { "type": "Table", - "element_id": "aa14fa2b3e26b2da889e9f80a7064bb3", + "element_id": "d8342c86b29d5479db19ae1beff8f957", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "Instance size (m, n) Average number of Locations Times Vehicles (8, 1500)(8, 2000)(8, 2500)(8, 3000)(12, 1500)(12, 2000)(12, 2500)(12, 3000)(16, 1500)(16, 2000)(16, 2500)(16, 3000) 568.40672.80923.40977.00566.00732.60875.001119.60581.80778.00879.001087.20 975.201048.001078.001113.20994.001040.601081.001107.40985.401040.601083.201101.60 652.20857.201082.401272.80642.00861.201096.001286.20667.80872.401076.401284.60 668,279.401,195,844.801,866,175.202,705,617.00674,191.001,199,659.801,878,745.202,711,180.40673,585.801,200,560.801,879,387.002,684,983.60" + "text": "Instance size (m, n) Average number of Locations Times Vehicles (8, 1500) (8, 2000) (8, 2500) (8, 3000) (12, 1500) (12, 2000) (12, 2500) (12, 3000) (16, 1500) (16, 2000) (16, 2500) (16, 3000) 568.40 672.80 923.40 977.00 566.00 732.60 875.00 1119.60 581.80 778.00 879.00 1087.20 975.20 1048.00 1078.00 1113.20 994.00 1040.60 1081.00 1107.40 985.40 1040.60 1083.20 1101.60 652.20 857.20 1082.40 1272.80 642.00 861.20 1096.00 1286.20 667.80 872.40 1076.40 1284.60 668,279.40 1,195,844.80 1,866,175.20 2,705,617.00 674,191.00 1,199,659.80 1,878,745.20 2,711,180.40 673,585.80 1,200,560.80 1,879,387.00 2,684,983.60" }, { "type": "Title", @@ -621,7 +621,7 @@ }, { "type": "Header", - "element_id": "e326e74f4607af7d370e049bc5d9e66a", + "element_id": "f64bebb0be23116b44b4ad54968178a0", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -631,7 +631,7 @@ }, { "type": "Header", - "element_id": "dd1252fa6e5f6c3f43669c9cc95952e7", + "element_id": 
"867ab47af6a021b50e74bbbc6f471965", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -651,13 +651,13 @@ }, { "type": "Table", - "element_id": "a557a4e8f1aa6814ae2a8f82e36f49e1", + "element_id": "ff667ddf988229560eaac54fc38ddc66", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "Number oflines Number of columns ineach line Description 11n 3m4 The number of depots, the number of trips, and the number of locations.The number of vehicles rd at each depot d.One line for each trip, i ¼ 1; 2; …; n. Each line provides the start location lstime tsi and the end time tei for the corresponding trip.Each element, δij; where i; j A 1; 2; …; l, refers to the travel time between location i andlocation j. i , the start i , the end location le l l" + "text": "Number of lines Number of columns in each line Description 1 1 n 3 m 4 The number of depots, the number of trips, and the number of locations. The number of vehicles rd at each depot d. One line for each trip, i ¼ 1; 2; …; n. Each line provides the start location ls time ts i and the end time te i for the corresponding trip. Each element, δij; where i; j A 1; 2; …; l, refers to the travel time between location i and location j. 
i , the start i , the end location le l l" }, { "type": "Title", @@ -761,7 +761,7 @@ }, { "type": "Title", - "element_id": "5b0294965f25f778012e27476e7ec042", + "element_id": "8502cf83321433fad96e32b2c8d087d2", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -771,7 +771,7 @@ }, { "type": "NarrativeText", - "element_id": "7797ef2531aca66f38fffe385b0a7cd1", + "element_id": "6e5b170122f44d4d618cc1f3f61f03c8", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -781,7 +781,7 @@ }, { "type": "NarrativeText", - "element_id": "4ddef4f1d3c214f1ec68b83dd5ebb497", + "element_id": "db341e650c62640867aa97c59d460245", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -791,7 +791,7 @@ }, { "type": "Title", - "element_id": "81db7fab0806640b0cbbac862671704f", + "element_id": "9478c846b097ffc916c7b084527ae320", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -801,7 +801,7 @@ }, { "type": "NarrativeText", - "element_id": "8f0264ba00616d29c2648dc51f24b439", + "element_id": "5253afbe39ab88206db982fbd37c7d37", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -811,7 +811,7 @@ }, { "type": "Title", - "element_id": "e56261e0bd30965b8e68ed2abb15b141", + "element_id": "a0d7deccf89e42d02a9d66b0c1889689", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -821,7 +821,7 @@ }, { "type": "ListItem", - "element_id": "c908229ed578a9ce4166099fccc82ecf", + "element_id": "9b36c6bf583abcc308f63e2cbd455d58", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -831,7 +831,7 @@ }, { "type": "ListItem", - "element_id": "47c7ba5982d990629bf3eb6600d81d22", + "element_id": "a8e92ff1d5b95768eb45e114f106924b", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -841,7 +841,7 @@ }, { "type": "ListItem", - "element_id": "c68a334dbad5df3d61ac8340f9d924f0", + "element_id": "344dbdfd5bdab54d523a6df3554cafc9", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -851,7 
+851,7 @@ }, { "type": "ListItem", - "element_id": "bde1d39e69305554a62aa021a4be4aaa", + "element_id": "bacca7917f0f2acdfb92f639034f29e6", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -861,7 +861,7 @@ }, { "type": "ListItem", - "element_id": "cb86b032337bb0863d6af52677251459", + "element_id": "4c6b282577604e5d857d3ec1825620cb", "metadata": { "data_source": {}, "filetype": "application/pdf", diff --git a/test_unstructured_ingest/expected-structured-output/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json index 9513c41b1c..9ecf7f2642 100644 --- a/test_unstructured_ingest/expected-structured-output/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json @@ -1,7 +1,7 @@ [ { "type": "Header", - "element_id": "cda1ae2f061dbdafb3374e6411d3823e", + "element_id": "e498b91ec0d6b552478a251e258dbfc1", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -21,7 +21,7 @@ }, { "type": "Title", - "element_id": "0302f9e0f412cb4c63f13818e571c25c", + "element_id": "9cb98ae81ff3b59a1df96854de9cc1a8", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -51,7 +51,7 @@ }, { "type": "Title", - "element_id": "44b59a545030365cd1ad225ed05ff22d", + "element_id": "f7495847504c2a5bd3366dfdbaa3cbd3", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -61,7 +61,7 @@ }, { "type": "Header", - "element_id": "d7106f2241a37dc4e61314f45da1ff5b", + "element_id": "17983340c4d298b6582155f71a460c48", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -71,7 +71,7 @@ }, { "type": "NarrativeText", - "element_id": "7ffd3b09cb23fc26ab2411d70e53838a", + "element_id": "ead6dc671b504f5520b80a19f7370322", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -81,7 +81,7 @@ }, { "type": "NarrativeText", - "element_id": 
"c02ccab64d2a356a96f5394a2b92fa0b", + "element_id": "0eb01f979318084d4714e13a8acdbc47", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -91,7 +91,7 @@ }, { "type": "NarrativeText", - "element_id": "d981d6dfaa8794c0bb733db0965b2831", + "element_id": "0a89d99f27e887124c8b86ca0fb5a969", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -111,7 +111,7 @@ }, { "type": "NarrativeText", - "element_id": "80abb04ec613b1d325ce6b8d0bb3349d", + "element_id": "16b4f1b50e1f1a2c19a286eb141d4819", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -121,7 +121,7 @@ }, { "type": "NarrativeText", - "element_id": "3f834ac0bf8b0dbd8d64ee065820467f", + "element_id": "7780dc842e365b3bab39812509ed1333", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -131,7 +131,7 @@ }, { "type": "NarrativeText", - "element_id": "117f7774fd093a60d964cc5b461f3e22", + "element_id": "8179530b70cc86671c5774ff7989d216", "metadata": { "data_source": {}, "filetype": "application/pdf", diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json index e713f94af2..88c7014584 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json @@ -11,13 +11,13 @@ }, { "type": "Header", - "element_id": "bac05707b1e00f5f57d8c702c068dc49", + "element_id": "2b55904c2fe851254b0221734a72deb4", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1 }, - "text": "nuJ 12 ] VC.sc[" + "text": "n u J 1 2 ] V C . 
s c [" }, { "type": "UncategorizedText", @@ -31,7 +31,7 @@ }, { "type": "Title", - "element_id": "2f7cc75f6467bba468022c4c2875335e", + "element_id": "72c8dc05682ffa4a143f4e59501ecd84", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -41,7 +41,7 @@ }, { "type": "NarrativeText", - "element_id": "7d5d472da16528a310bc18c9682ed62d", + "element_id": "4da978e09ded2156643e427e0f71cf88", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -51,7 +51,7 @@ }, { "type": "NarrativeText", - "element_id": "4fcc5b6364213b1efa9272bdce4f9fcd", + "element_id": "f54e4537371345c448859842761ec65e", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -61,7 +61,7 @@ }, { "type": "NarrativeText", - "element_id": "be90d2640470e975e3402d19ba2c66cf", + "element_id": "c8cfb1cf0c9e6709f7ece87e26094434", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -71,7 +71,7 @@ }, { "type": "NarrativeText", - "element_id": "e66a3d2b6c9a872c53e226d8e0cc0a0e", + "element_id": "98f930d9b2f5bbc33cafb4bbe439dc63", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -81,7 +81,7 @@ }, { "type": "Title", - "element_id": "3fa53fc0dab8ef96d05d8fd4c7e41b49", + "element_id": "8de2a6e07c3e4df67625a60f079764a4", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -91,7 +91,7 @@ }, { "type": "NarrativeText", - "element_id": "bca638b88125eed8a8003e46a6055618", + "element_id": "2a92239d820b34481bc23e44e7767b05", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -101,7 +101,7 @@ }, { "type": "Title", - "element_id": "0119810584ee0b01e4d14dfd8c250bf2", + "element_id": "0778dedc6a5e78f11b66915003e6afde", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -111,7 +111,7 @@ }, { "type": "NarrativeText", - "element_id": "82d5520be5fd847464727f56151d316c", + "element_id": "c749ec9752e4f4b69435de6279e1d2d5", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -121,7 +121,7 @@ }, { "type": 
"NarrativeText", - "element_id": "c1f1ba1630bc19bd24c1dfbc1548f2d8", + "element_id": "147e0762bc4da382a2b673af087c7c10", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -131,7 +131,7 @@ }, { "type": "NarrativeText", - "element_id": "836e6ef5cecc9a73356c0d5bee181829", + "element_id": "641105bcd721926cf18a80c1a138f998", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -141,7 +141,7 @@ }, { "type": "ListItem", - "element_id": "6c42f77e0ba5dfe7336a4c1a4fce00e4", + "element_id": "b63bb581fef5728c1d27774ceb25b22f", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -161,7 +161,7 @@ }, { "type": "ListItem", - "element_id": "50f59772d4134ececeaf37069d480784", + "element_id": "f053840b2f6b2786b33533441b034e08", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -181,7 +181,7 @@ }, { "type": "ListItem", - "element_id": "9a576fe6eb4355cdf1e772cf462a9eb7", + "element_id": "ec6b1abc1c5e033ef92ab38cfee3b80b", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -191,7 +191,7 @@ }, { "type": "ListItem", - "element_id": "18b1855acfb386ae6e6a253da566e93b", + "element_id": "f98e90f859af76d260aa96b86cc57e27", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -201,7 +201,7 @@ }, { "type": "NarrativeText", - "element_id": "1f0f5df7c23d4f8e8de4de3085abd7d8", + "element_id": "16e14d7bf31f029a5813c557a245af20", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -211,7 +211,7 @@ }, { "type": "NarrativeText", - "element_id": "aebca7fedab12541cd5af93183b619e9", + "element_id": "8a585b845f71ae5edf85991e661227f2", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -221,7 +221,7 @@ }, { "type": "Header", - "element_id": "4c2478cf439baab6ace34761eda527d9", + "element_id": "72c43f1a56f77795dacfdfe034f47354", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -231,7 +231,7 @@ }, { "type": "NarrativeText", - "element_id": 
"74a7758f83612467af8eea9d20e4a6f7", + "element_id": "31cbd7fa4b6e5d40d4f5de572cd7bd86", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -241,7 +241,7 @@ }, { "type": "NarrativeText", - "element_id": "9b8fc4816306f4f1b31874d53134979b", + "element_id": "d58c4eaa08e6195595233bbeb87ae331", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -251,7 +251,7 @@ }, { "type": "Title", - "element_id": "1513104c7bf6cd40223a7cc23798378f", + "element_id": "d2e1a97795476cc4aba05a95edc57860", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -261,7 +261,7 @@ }, { "type": "NarrativeText", - "element_id": "181670e0d50864954486b337b1d19118", + "element_id": "13105a0ebbeccd12a8785d2b83752720", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -271,7 +271,7 @@ }, { "type": "NarrativeText", - "element_id": "0ebd432f2495d0bfc8303eca930cc9e5", + "element_id": "1d6246af89ad959f51c99ebf74ba8819", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -281,7 +281,7 @@ }, { "type": "NarrativeText", - "element_id": "19c6112edbf782bfc4f5cf04829f57ad", + "element_id": "5051f4837b5344032779c265056506c8", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -301,7 +301,7 @@ }, { "type": "ListItem", - "element_id": "00e84a3be86673ff9bb8476f5132d4bf", + "element_id": "c38b21939804a379980291ee06dc0387", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -321,7 +321,7 @@ }, { "type": "NarrativeText", - "element_id": "3fd5be2cdc473424f58b9da0192dec01", + "element_id": "5aca548166e8ec0e670e39fbe0520256", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -331,7 +331,7 @@ }, { "type": "NarrativeText", - "element_id": "cd924e18fd419111b4ead552fb7cc36b", + "element_id": "787c5cea13c47adb33fa8ea482b1180c", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -341,7 +341,7 @@ }, { "type": "NarrativeText", - "element_id": "de9b76ca2c36f4a1cc39c9bc69b75a45", + 
"element_id": "ebcf4a16b5baa55fe81e8b98f1443006", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -351,7 +351,7 @@ }, { "type": "Title", - "element_id": "740238ba10202556c962840e5c882446", + "element_id": "32e60b294a649b54069f8fdce6d3658e", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -361,7 +361,7 @@ }, { "type": "NarrativeText", - "element_id": "1f4bc06117d2be9d9e297dbe07aa05cd", + "element_id": "995c62dc17faaf10b13aa42abefe3ec6", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -371,7 +371,7 @@ }, { "type": "Header", - "element_id": "4c2478cf439baab6ace34761eda527d9", + "element_id": "72c43f1a56f77795dacfdfe034f47354", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -381,7 +381,7 @@ }, { "type": "FigureCaption", - "element_id": "b51f99cb953082a922ba43c09d4492b3", + "element_id": "77f06cc5e2c90400a79f0109988476c5", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -391,14 +391,14 @@ }, { "type": "Table", - "element_id": "120c712c3b2e7c5572e9207c10a5c435", + "element_id": "c57f2166778009c6ccc9032ee8883253", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5, "text_as_html": "
Dataset| Base Model'| Large Model| Notes
PubLayNet B8]|F/MMLayouts of modern scientific documents
PRImAM-Layouts of scanned modern magazines and scientific reports
NewspaperF-Layouts of scanned US newspapers from the 20th century
TableBankFFTable region on modern scientific and business document
HJDatasetF/M-Layouts of history Japanese documents
" }, - "text": "Base Model1 Large Model Notes Dataset PubLayNet [38]PRImA [3]Newspaper [17]TableBank [18]HJDataset [31] F / MMFFF / M M--F- Layouts of modern scientific documentsLayouts of scanned modern magazines and scientific reportsLayouts of scanned US newspapers from the 20th centuryTable region on modern scientific and business documentLayouts of history Japanese documents" + "text": "Base Model1 Large Model Notes Dataset PubLayNet [38] PRImA [3] Newspaper [17] TableBank [18] HJDataset [31] F / M M F F F / M M - - F - Layouts of modern scientific documents Layouts of scanned modern magazines and scientific reports Layouts of scanned US newspapers from the 20th century Table region on modern scientific and business document Layouts of history Japanese documents" }, { "type": "Title", @@ -462,7 +462,7 @@ }, { "type": "Footer", - "element_id": "c24bcb2cf98d6226bd805b6f99d3b61a", + "element_id": "f58e29ccb8c8b46f5876aec8c9684294", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -472,7 +472,7 @@ }, { "type": "NarrativeText", - "element_id": "9fb9573af5bf767f81cdaf2cf1a72cd9", + "element_id": "e937103da807b2445f302bd678119123", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -482,7 +482,7 @@ }, { "type": "Title", - "element_id": "9f26ca353a2c130a2e32f457d71c1350", + "element_id": "05df12485855b8bfb3552abbf90cfad4", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -492,7 +492,7 @@ }, { "type": "NarrativeText", - "element_id": "11dff8778699e76422be6b86c9eaa62a", + "element_id": "2e74c8e7455e99acd9d85a11db6b5f6e", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -502,17 +502,17 @@ }, { "type": "NarrativeText", - "element_id": "bdcad7be96e533af709aaccaff3bf7e7", + "element_id": "41140d5c71e1ee278c01309bf0cb354a", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5 }, - "text": "1 import layoutparser as lp2 image = cv2 . 
imread ( \" image_file \" ) # load images3 model = lp . De t e c tro n2 Lay outM odel ( \" lp :// PubLayNet / f as t er _ r c nn _ R _ 50 _ F P N_ 3 x / config \" ) 45 layout = model . detect ( image )" + "text": "1 import layoutparser as lp 2 image = cv2 . imread ( \" image_file \" ) # load images 3 model = lp . De t e c tro n2 Lay outM odel ( \" lp :// PubLayNet / f as t er _ r c nn _ R _ 50 _ F P N_ 3 x / config \" ) 4 5 layout = model . detect ( image )" }, { "type": "NarrativeText", - "element_id": "3aff40c86aa58c0362102802f4ef172f", + "element_id": "812cacf0b2c9dd62a37d4361ab3e6eec", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -522,7 +522,7 @@ }, { "type": "NarrativeText", - "element_id": "5c44994a44f74b706d8a5e74cd753a8b", + "element_id": "2e73b03555667b9209d10b34a5f3a411", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -542,7 +542,7 @@ }, { "type": "FigureCaption", - "element_id": "cafae07120d714f0822e89865adf62da", + "element_id": "8625c5524829e0a8a6bc7e96d68b20f2", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -552,7 +552,7 @@ }, { "type": "NarrativeText", - "element_id": "7461d30ee7c51c91bca8003792d43bfe", + "element_id": "dbae85429e87bb2c7eb4c9c942854fae", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -562,7 +562,7 @@ }, { "type": "Title", - "element_id": "acd4f4584a990134d927e19b6d7e5f88", + "element_id": "090531a7f1a39471d158a00394cb6568", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -572,7 +572,7 @@ }, { "type": "NarrativeText", - "element_id": "fb271c99cdcfca1001a1a7d56425c5b4", + "element_id": "f0a4ff01ca0d75d7cc17c8449b5390bd", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -582,7 +582,7 @@ }, { "type": "Title", - "element_id": "4c2478cf439baab6ace34761eda527d9", + "element_id": "72c43f1a56f77795dacfdfe034f47354", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -592,7 +592,7 @@ }, { "type": 
"NarrativeText", - "element_id": "f2a3e5fbb983d9132dddecc381ed6e0b", + "element_id": "e9b37f0afee4d75cdf9aa1ca1ac2c16a", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -602,7 +602,7 @@ }, { "type": "NarrativeText", - "element_id": "eec800eef6e395c21feacd729868dd18", + "element_id": "92d5fb14680a0049488ec358e18011d7", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -612,7 +612,7 @@ }, { "type": "Title", - "element_id": "89c6cd1d893f782ea68d75737e3393fd", + "element_id": "84150b7fe95fda995dc59970e71d3b6b", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -622,7 +622,7 @@ }, { "type": "NarrativeText", - "element_id": "e284bd66511cfa064681253e7ac57a9a", + "element_id": "2c44a3b1b50fe609dd4e599aaa9a2a42", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -632,7 +632,7 @@ }, { "type": "NarrativeText", - "element_id": "55ab2654fa8c2c01de322b52f4fad508", + "element_id": "82d51f922a472021605996e1090eb1ff", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -642,7 +642,7 @@ }, { "type": "NarrativeText", - "element_id": "7a151dbbe8b26ccdcb264ab005be5a36", + "element_id": "a5aaa103ed722fa6bfd700ec8edd554f", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -652,7 +652,7 @@ }, { "type": "NarrativeText", - "element_id": "77ccbf022ce60ecfc6bac26bc6306a1d", + "element_id": "c7020e982bb4fc78992a74d74b2db961", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -672,7 +672,7 @@ }, { "type": "ListItem", - "element_id": "1e86062fd626f6ffe96ea28a7ff8f1df", + "element_id": "b2f82a8b9096623d70752bdee08e3703", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -682,7 +682,7 @@ }, { "type": "NarrativeText", - "element_id": "6727ba436ddf5e47087d005ded6c049f", + "element_id": "128743def9375aa9fbfc67ceeb31ec04", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -712,14 +712,14 @@ }, { "type": "Table", - "element_id": 
"1c70e4dd20e663ba4fcaa60af53adcbd", + "element_id": "8f963c226c45bef6fa7f2644bc20b149", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 8, "text_as_html": "
block.pad(top, bottom,right,left)Enlarge the current block according to the input
block.scale(fx, fy)Scale the current block given the ratio in x and y direction
block.shift(dx, dy)Move the current block with the shift distances in x and y direction
block1.is_in(block2)Whether block] is inside of block2
block1. intersect (block2)Return the intersection region of blockl and block2. Coordinate type to be determined based on the inputs.
block1.union(block2)Return the union region of blockl and block2. Coordinate type to be determined based on the inputs.
block1.relative_to(block2)Convert the absolute coordinates of block to relative coordinates to block2
block1.condition_on(block2) block. crop_image (image)Calculate the absolute coordinates of blockl given the canvas block2’s absolute coordinates Obtain the image segments in the block region
" }, - "text": "block.pad(top, bottom, right, left) Enlarge the current block according to the input Scale the current block given the ratioin x and y direction block.scale(fx, fy) Move the current block with the shiftdistances in x and y direction block.shift(dx, dy) Whether block1 is inside of block2 block1.is in(block2) block1.intersect(block2) block1.union(block2) Convert the absolute coordinates of block1 torelative coordinates to block2 block1.relative to(block2) Calculate the absolute coordinates of block1 giventhe canvas block2’s absolute coordinates block1.condition on(block2)" + "text": "block.pad(top, bottom, right, left) Enlarge the current block according to the input Scale the current block given the ratio in x and y direction block.scale(fx, fy) Move the current block with the shift distances in x and y direction block.shift(dx, dy) Whether block1 is inside of block2 block1.is in(block2) block1.intersect(block2) block1.union(block2) Convert the absolute coordinates of block1 to relative coordinates to block2 block1.relative to(block2) Calculate the absolute coordinates of block1 given the canvas block2’s absolute coordinates block1.condition on(block2)" }, { "type": "NarrativeText", @@ -893,7 +893,7 @@ }, { "type": "Title", - "element_id": "709c2d8cd3b15512f8452715fab45e4f", + "element_id": "0c8b5878c9434b010582e935dc19da9b", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -913,7 +913,7 @@ }, { "type": "Title", - "element_id": "d57c26aad19349fc98ee8822f24f19d9", + "element_id": "0778f018e259bb4c53ad7e04a5aaa9af", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -923,7 +923,7 @@ }, { "type": "NarrativeText", - "element_id": "e07a3053a112880cf693f019d010cc19", + "element_id": "27a0e8b4541ec34bfdf6c52b7e06237d", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -933,7 +933,7 @@ }, { "type": "NarrativeText", - "element_id": "1ef9621705354b738772d2108d0fb6ab", + "element_id": 
"337a2fd3485821eeaf5d9ada5c7ab74a", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -943,7 +943,7 @@ }, { "type": "ListItem", - "element_id": "4c2478cf439baab6ace34761eda527d9", + "element_id": "72c43f1a56f77795dacfdfe034f47354", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -963,7 +963,7 @@ }, { "type": "NarrativeText", - "element_id": "cc8ad6e0f933633a37b82200e6724f9e", + "element_id": "0770888d2e26bc88f295478d836d13bc", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -973,7 +973,7 @@ }, { "type": "NarrativeText", - "element_id": "19cc210888c40b3403e1992b335bccf7", + "element_id": "2e944a19489d47a5c007d491534ab343", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -983,7 +983,7 @@ }, { "type": "NarrativeText", - "element_id": "60c0620e0e68ad30f5cff23dd0ef53c5", + "element_id": "4da48f397c533740d510fc9004f29a1f", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -993,7 +993,7 @@ }, { "type": "NarrativeText", - "element_id": "7cc706ff50f3746845f312c011318d84", + "element_id": "8b762e8d2e1bbc81253ae6eb13e486ad", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1003,7 +1003,7 @@ }, { "type": "ListItem", - "element_id": "9bf176adca2cfa747e7f0255bfc3594a", + "element_id": "28df6f16ea44b96ebf6c8b686715f5ea", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1023,7 +1023,7 @@ }, { "type": "NarrativeText", - "element_id": "f6d1c03644c4866a2dd06f8e432f6286", + "element_id": "81c96876fd12f0bd68cfe4cebc910036", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1033,7 +1033,7 @@ }, { "type": "Title", - "element_id": "a84f27645308850514566b3bb9d3efa0", + "element_id": "abf674029db2adb9d1944b45561de416", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1043,7 +1043,7 @@ }, { "type": "NarrativeText", - "element_id": "ea475aba47ae4b4db2eeb1a96bc30797", + "element_id": "ea958ca4de2431c62a01a51ace78c921", "metadata": 
{ "data_source": {}, "filetype": "application/pdf", @@ -1053,7 +1053,7 @@ }, { "type": "NarrativeText", - "element_id": "966440df8a08ef481c35486bdb301d6a", + "element_id": "c37820b206a0ea27d42b4db4da758759", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1063,7 +1063,7 @@ }, { "type": "Title", - "element_id": "0e654d0b0bc44cbd58f1cb7c7b02c3c5", + "element_id": "1f9963e29820a25df2f26ba5f22fd733", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1073,7 +1073,7 @@ }, { "type": "NarrativeText", - "element_id": "9d4feeabd8c04d4b33897afb58e46f55", + "element_id": "ca963177810f8c3b64522859751655e7", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1083,7 +1083,7 @@ }, { "type": "Title", - "element_id": "4c2478cf439baab6ace34761eda527d9", + "element_id": "72c43f1a56f77795dacfdfe034f47354", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1093,7 +1093,7 @@ }, { "type": "NarrativeText", - "element_id": "5cdbcea58a81d8f7de9a4fa841107be1", + "element_id": "c5652cdf729305eb4b474db73d9732a4", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1103,7 +1103,7 @@ }, { "type": "Title", - "element_id": "1fe1cb84a12b8216ea9d734262b3e4ec", + "element_id": "c2db3c9ed08076430c7524b7e3c1076f", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1133,7 +1133,7 @@ }, { "type": "NarrativeText", - "element_id": "fa19ab2536cbbb48c09de29fdebd52bd", + "element_id": "7f729881ca5ef5ee556e96246fa78528", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1143,7 +1143,7 @@ }, { "type": "NarrativeText", - "element_id": "3cbd8234ac0c6d29feb24e6202144aa8", + "element_id": "4a938b67ce50050a388cb3a1d19e5599", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1153,7 +1153,7 @@ }, { "type": "NarrativeText", - "element_id": "a15f39c44cf16af58226245db3862c6e", + "element_id": "dd094fd6dc74291ed1c6612d65a060e0", "metadata": { "data_source": {}, "filetype": 
"application/pdf", @@ -1163,7 +1163,7 @@ }, { "type": "Title", - "element_id": "de2a222ad7b9cf1e5e5432f53c15996d", + "element_id": "e3221fd8edd58a8c026465fab665d928", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1173,7 +1173,7 @@ }, { "type": "NarrativeText", - "element_id": "7174760d4c8d9b7b13da3918015312dc", + "element_id": "fd8fa97590b686b54db41b84e0add377", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1183,7 +1183,7 @@ }, { "type": "NarrativeText", - "element_id": "9b51c55d2dd4ffd289138fc4f66e11e6", + "element_id": "e129c45423c8e88740a1237481a22cd2", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1193,7 +1193,7 @@ }, { "type": "NarrativeText", - "element_id": "888b9c9ec4431146d744bc6f39e16fd0", + "element_id": "54dc8c33b479ec74e88874cd2c0a616c", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1203,7 +1203,7 @@ }, { "type": "NarrativeText", - "element_id": "07be9fda679b805e67cf5e563eada033", + "element_id": "acfcaae53850b43f814bf04643363718", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1213,7 +1213,7 @@ }, { "type": "NarrativeText", - "element_id": "069379b2abcf2bed44f13bdaea90ec2d", + "element_id": "776ff927c4926b5a61aaa7ec9ef2e869", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1233,7 +1233,7 @@ }, { "type": "ListItem", - "element_id": "5b6b4f6a5766bdb4f09f0a0387a3a373", + "element_id": "e6aa0629950d567b0c8c8b6d1b39cd9f", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1243,7 +1243,7 @@ }, { "type": "ListItem", - "element_id": "5ac56b3874cc4fa43f9ce8fdd05bc8b5", + "element_id": "8a9ed739daba419123f271dce4be295a", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1253,7 +1253,7 @@ }, { "type": "ListItem", - "element_id": "4c2478cf439baab6ace34761eda527d9", + "element_id": "72c43f1a56f77795dacfdfe034f47354", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1273,7 +1273,7 @@ 
}, { "type": "FigureCaption", - "element_id": "1a2b9e59d53ac38ee6affb3ffcda6b8c", + "element_id": "fed869ba2e01c3daf0816b48809a9790", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1283,7 +1283,7 @@ }, { "type": "Title", - "element_id": "76c98240da7b06b4b3fcf8109edbbaba", + "element_id": "cd43c1ef71b09de2a5c46062419dd4b4", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1293,7 +1293,7 @@ }, { "type": "NarrativeText", - "element_id": "0f6c572efe499db5f3a396c3f898b39a", + "element_id": "06979af9f7bf72e15b5a4f378d5651c4", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1303,7 +1303,7 @@ }, { "type": "NarrativeText", - "element_id": "d423e43627591688a7a55d37adbf14e7", + "element_id": "1c5af18d708d6b5143f58351b5a3f059", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1313,7 +1313,7 @@ }, { "type": "NarrativeText", - "element_id": "8ff0f0a5b4e520b95b6d74614366af1e", + "element_id": "af4a07cd0c723749e00e1238a013e9b3", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1323,7 +1323,7 @@ }, { "type": "ListItem", - "element_id": "91e724833d5794abbd5fd6ad6c54aa9f", + "element_id": "34146aaef7807c3d570b0fd1d1d95a9f", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1333,7 +1333,7 @@ }, { "type": "Title", - "element_id": "a2a71736439cbc5e1445bddd40712b9b", + "element_id": "444b7d3480001e2e422559c09fc04e41", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1343,7 +1343,7 @@ }, { "type": "NarrativeText", - "element_id": "ad29b99d522a90b8084b53f55ca78e02", + "element_id": "992e926b29c1f50bd6a710a37364da57", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1353,7 +1353,7 @@ }, { "type": "NarrativeText", - "element_id": "f85505b114cf50b99bc0ae7c3805774d", + "element_id": "4e85c94ea9d4564cdfc847d0312e1776", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1363,7 +1363,7 @@ }, { "type": "Title", - "element_id": 
"e56261e0bd30965b8e68ed2abb15b141", + "element_id": "a0d7deccf89e42d02a9d66b0c1889689", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1373,7 +1373,7 @@ }, { "type": "ListItem", - "element_id": "f7e8d95a8f2b84a4461e037b0a7b9704", + "element_id": "5c38ef04891d707aa9ef58c34413ee54", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1383,7 +1383,7 @@ }, { "type": "ListItem", - "element_id": "24862433f743a0910da62ec3fb4f537c", + "element_id": "fdd4f75f271a06fd05f220334c267202", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1393,7 +1393,7 @@ }, { "type": "ListItem", - "element_id": "79a1f55a3945eb6304697ec72847ed35", + "element_id": "ca3a95da41baaa8e801acfb997025998", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1403,7 +1403,7 @@ }, { "type": "ListItem", - "element_id": "cafb24e03d3f74ce81ba82312af7bfc2", + "element_id": "a16bebdda43cc5159e1b7eaa648570aa", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1413,7 +1413,7 @@ }, { "type": "ListItem", - "element_id": "6871ed87adfeab0abf4784c0c72e2ebb", + "element_id": "44c08956586fa53e0d5e2c88d3eba785", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1423,7 +1423,7 @@ }, { "type": "ListItem", - "element_id": "b000578a41ffcc554faac04609d2f4e1", + "element_id": "cd068f1335b5e98ce7a668c734c3ffca", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1433,7 +1433,7 @@ }, { "type": "ListItem", - "element_id": "c6e835fe03323406543926cc0f5a94de", + "element_id": "ed21b4eaf981727504a77bc51444751c", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1443,7 +1443,7 @@ }, { "type": "ListItem", - "element_id": "4c2478cf439baab6ace34761eda527d9", + "element_id": "72c43f1a56f77795dacfdfe034f47354", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1463,7 +1463,7 @@ }, { "type": "ListItem", - "element_id": "c8f5863d94cc9b9d77f153c6d1b0015a", + "element_id": 
"ade6873735c6be9677804c9634ed9dec", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1483,7 +1483,7 @@ }, { "type": "ListItem", - "element_id": "4d54eb351d8fc3bfbbf7286aa15eabe3", + "element_id": "77589f390ec51c072e05d82c60dcfeab", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1493,7 +1493,7 @@ }, { "type": "ListItem", - "element_id": "7ceaba2290e3f9c5f3754032ce4d5663", + "element_id": "8f1508523763d343728246a315e7ce89", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1503,7 +1503,7 @@ }, { "type": "ListItem", - "element_id": "18d58ed781efccf5e09bcab4064fe090", + "element_id": "e3c8484f21642805e61903a9aa4adba8", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1513,7 +1513,7 @@ }, { "type": "ListItem", - "element_id": "1f1a0fac1bae95f076ea34c955551632", + "element_id": "fbb53a1b98120c451da95a4666c78594", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1523,7 +1523,7 @@ }, { "type": "ListItem", - "element_id": "0aabfb2a8e358618179ec2e1d322e519", + "element_id": "490e08d6f295c2172516ec04be8a530a", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1533,7 +1533,7 @@ }, { "type": "ListItem", - "element_id": "df18427a8013b4df36e8ac4e2ee5da3a", + "element_id": "2ce4e05a66c2f0f335cb795e064dfe26", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1543,7 +1543,7 @@ }, { "type": "ListItem", - "element_id": "257e7b8aef89c41e03bf837ea517885e", + "element_id": "ed3fbd8b8776e164e7d02dfa57e7f8fe", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1553,7 +1553,7 @@ }, { "type": "ListItem", - "element_id": "00c7abdd98fedd1746994d16ca44d45f", + "element_id": "8f319c83ff95c5f927749171e57d44fb", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1563,7 +1563,7 @@ }, { "type": "ListItem", - "element_id": "7a0afd734c99f6b076dc58b2e57cfec6", + "element_id": "3f1e59a4c91bb3b88df6c357b1d834dc", "metadata": { "data_source": 
{}, "filetype": "application/pdf", @@ -1573,7 +1573,7 @@ }, { "type": "ListItem", - "element_id": "00d6ff1b3fb21f8a608f3b6269df56be", + "element_id": "d9de765c3196caee0fb4a1d730eed52c", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1583,7 +1583,7 @@ }, { "type": "ListItem", - "element_id": "deecdfacbce71dd1425fd54010b2fad1", + "element_id": "7b9980045cdd3d64209cbc2d35575252", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1593,7 +1593,7 @@ }, { "type": "ListItem", - "element_id": "c46384f7d585f482420cd9e0e10ef4af", + "element_id": "bb9e1ef93096188e34c6be00a58ead5c", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1613,7 +1613,7 @@ }, { "type": "ListItem", - "element_id": "21d151e4c182a1f441c3486d2f79afc0", + "element_id": "0f799e1e66cc92cf9d0d88aa327e46a1", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1623,7 +1623,7 @@ }, { "type": "ListItem", - "element_id": "c9d8f6434425015c72f94fb212bba28f", + "element_id": "73731045a017a4af5d28a0d329a5b3ae", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1633,7 +1633,7 @@ }, { "type": "ListItem", - "element_id": "9c3e13a0e9738b846289bff06952da3b", + "element_id": "80e6b0ce1a9da436be112ea81ae6f742", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1653,7 +1653,7 @@ }, { "type": "ListItem", - "element_id": "bd680d8baa57cc15337de2e0c299d121", + "element_id": "5cce0aff59eb4a6c10cacd3a5b1dca3c", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1673,7 +1673,7 @@ }, { "type": "ListItem", - "element_id": "9dce913bddaa63724f5de64e539b7016", + "element_id": "090d32c2a94f4e868cafd598692d005a", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1683,7 +1683,7 @@ }, { "type": "ListItem", - "element_id": "4c8ddc159ec208bb7f454603fcd7c4bd", + "element_id": "916d069c08c9d5bc3408ef856cb496e0", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1703,7 +1703,7 @@ }, { 
"type": "ListItem", - "element_id": "93d261a89a8422fb8d166e6cdf95d8f6", + "element_id": "1cad6b363bfdd2605da0d40f6007060a", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1713,7 +1713,7 @@ }, { "type": "ListItem", - "element_id": "6c94dd219ce339c358163833e20d099e", + "element_id": "b4233941aba94560bfacb596c7b3bb52", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1733,7 +1733,7 @@ }, { "type": "ListItem", - "element_id": "2625b6830768eac986cfee208c0270de", + "element_id": "6a2460e2677fe2a88aad7c65764e06e9", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -1753,12 +1753,12 @@ }, { "type": "ListItem", - "element_id": "042006f2d2112f116d1942c22ecc1d9d", + "element_id": "e5b16f1d7676cb17c5c2177ec90987f2", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 16 }, - "text": "largest dataset ever for doc-In: 2019 International Conference on DocumentIEEE (Sep 2019). umentAnalysis and Recognition (ICDAR). pp. 1015–1022.https://doi.org/10.1109/ICDAR.2019.00166 layout analysis." + "text": "largest dataset ever for doc- In: 2019 International Conference on Document IEEE (Sep 2019). ument Analysis and Recognition (ICDAR). pp. 1015–1022. https://doi.org/10.1109/ICDAR.2019.00166 layout analysis." 
} ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json index ae389c27c8..b41b03a1f2 100644 --- a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json @@ -1,7 +1,7 @@ [ { "type": "Title", - "element_id": "30537dddb2f7525b398da1b2dfcf0255", + "element_id": "0a5f21a42e259b1ae44adc8758f2db19", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -19,7 +19,7 @@ }, { "type": "Title", - "element_id": "5642647a217c5810732bbb06ae629582", + "element_id": "c2218d09ef001e697de0e0676777a643", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -33,11 +33,11 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "WORLD ECONOMIC OUTLOOKUPDATEInflation Peaking amid Low Growth" + "text": "WORLD ECONOMIC OUTLOOK UPDATE Inflation Peaking amid Low Growth" }, { "type": "Title", - "element_id": "3ab232314cc69a54fea83cb81dd05413", + "element_id": "1cea24ad0aab30d447fc2b47dcd4f259", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -51,7 +51,7 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "2023JAN" + "text": "2023 JAN" }, { "type": "Image", @@ -73,7 +73,7 @@ }, { "type": "Title", - "element_id": "12d4f57c3f43b0afbdf88305940258bc", + "element_id": "e7ea93453698b4f8bc32fd7cb860617e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -91,7 +91,7 @@ }, { "type": "ListItem", - "element_id": "9fe27138e05d3a42d1e5cc57bc1fbc54", + "element_id": 
"dd494a076f4a875e3ff8591dd84e3bcb", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -105,11 +105,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": " Global growth is projected to fall from an estimated 3.4 percent in 2022 to 2.9 percent in 2023, then rise to 3.1 percent in 2024. The forecast for 2023 is 0.2 percentage point higher than predicted in the October 2022 World Economic Outlook (WEO) but below the historical (2000–19) average of 3.8 percent. The rise in central bank rates to fight inflation and Russia’s war in Ukraine continue to weigh on economic activity. The rapid spread of COVID-19 in China dampened growth in 2022, but the recent reopening has paved the way for a faster-than-expected recovery. Global inflation is expected to fall from 8.8 percent in 2022 to 6.6 percent in 2023 and 4.3 percent in 2024, still above pre-pandemic (2017–19) levels of about 3.5 percent." + "text": "Global growth is projected to fall from an estimated 3.4 percent in 2022 to 2.9 percent in 2023, then rise to 3.1 percent in 2024. The forecast for 2023 is 0.2 percentage point higher than predicted in the October 2022 World Economic Outlook (WEO) but below the historical (2000–19) average of 3.8 percent. The rise in central bank rates to fight inflation and Russia’s war in Ukraine continue to weigh on economic activity. The rapid spread of COVID-19 in China dampened growth in 2022, but the recent reopening has paved the way for a faster-than-expected recovery. Global inflation is expected to fall from 8.8 percent in 2022 to 6.6 percent in 2023 and 4.3 percent in 2024, still above pre-pandemic (2017–19) levels of about 3.5 percent." 
}, { "type": "ListItem", - "element_id": "b9bde2d8da52aaab6c30a5ba04b47586", + "element_id": "69c0abb4a05d8b3650ac06c6c07c3b88", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -127,7 +127,7 @@ }, { "type": "ListItem", - "element_id": "cdf520693b6ec6dc4877bc4aedea746c", + "element_id": "40f2b406a410dadedbf14e1310e6fd94", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -163,7 +163,7 @@ }, { "type": "NarrativeText", - "element_id": "bb50ad035681bfb501e33a52abe173ad", + "element_id": "fa4f01ba2113b0b7859f01f31ef6c5b1", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -181,7 +181,7 @@ }, { "type": "NarrativeText", - "element_id": "041668dbcf5b0c4114acae7ef393f5cd", + "element_id": "895af44c7b71b95db8e28f86ef2224c8", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -199,7 +199,7 @@ }, { "type": "NarrativeText", - "element_id": "42213af1ed4e31e1ce00eba6ce07ee5e", + "element_id": "a4bc8a1fe50aa3ff61ba53d96830f9c1", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -253,7 +253,7 @@ }, { "type": "NarrativeText", - "element_id": "15d7968ef76d05b9b7d490cd2ebe6550", + "element_id": "78f4f092ce77888950ef2172b8d2a92a", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -397,7 +397,7 @@ }, { "type": "NarrativeText", - "element_id": "6814df88a59d11e9fcf76a7ed0f5fdfc", + "element_id": "2c469f8fa0f3c1c771330dde1be1b28c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -487,7 +487,7 @@ }, { "type": "NarrativeText", - "element_id": "83ce77349b07c275543d551c2c016370", + "element_id": "c146a58289e616dfc7ba0154e66a262b", 
"metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -541,7 +541,7 @@ }, { "type": "NarrativeText", - "element_id": "22011dc596eec73711d7dac8d99b41b6", + "element_id": "e2faa573314abd00886d18714776ff1e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -559,7 +559,7 @@ }, { "type": "NarrativeText", - "element_id": "97e04ee873fea0151df00f7b1fb4ca42", + "element_id": "7cd3302c25869c2f5421bee2f41417be", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -577,7 +577,7 @@ }, { "type": "NarrativeText", - "element_id": "e08dfaba8a8dc7496a44cb172319d4ba", + "element_id": "3416e531cf0cb72208991b73db6ae3ef", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -595,7 +595,7 @@ }, { "type": "NarrativeText", - "element_id": "73a39336fb540e7d57ec85dfa8e92799", + "element_id": "ce9fc96e38d94a623bb7ffee822ac214", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -613,25 +613,7 @@ }, { "type": "ListItem", - "element_id": "e3b0c44298fc1c149afbf4c8996fb924", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "" - }, - { - "type": "ListItem", - "element_id": "e84075ae46df9d9ad37d947011c05a7f", + "element_id": "78f4bc5a836981f7b1ccfd9ad718cc72", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -685,7 +667,7 @@ }, { "type": 
"NarrativeText", - "element_id": "67f04acf5353c625d003fd003acb56f3", + "element_id": "8ab8fca37bfc3201f31244df6b7c4d82", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -703,7 +685,7 @@ }, { "type": "ListItem", - "element_id": "f2679b646aeff359030eec35f2758f9b", + "element_id": "2b03eaeb94d29bd57f35cc895c8e50c8", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -717,11 +699,11 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "Growth in the euro area is projected to bottom out at 0.7 percent in 2023 before rising to 1.6 percent in 2024. The 0.2 percentage point upward revision to the forecast for 2023 reflects theeffects of faster rate hikes by the European Central Bank and eroding real incomes, offset bythe carryover from the 2022 outturn, lower wholesale energy prices, and additionalannouncements of fiscal purchasing power support in the form of energy price controls andcash transfers." + "text": "Growth in the euro area is projected to bottom out at 0.7 percent in 2023 before rising to 1.6 percent in 2024. The 0.2 percentage point upward revision to the forecast for 2023 reflects the effects of faster rate hikes by the European Central Bank and eroding real incomes, offset by the carryover from the 2022 outturn, lower wholesale energy prices, and additional announcements of fiscal purchasing power support in the form of energy price controls and cash transfers." 
}, { "type": "ListItem", - "element_id": "39b82856add2dc690f2dcb3f2c0c1819", + "element_id": "8674194ed5ca8d731d521f16b602a7ff", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -735,11 +717,11 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "Growth in the United Kingdom is projected to be –0.6 percent in 2023, a 0.9 percentage point downward revision from October, reflecting tighter fiscal and monetary policies and financialconditions and still-high energy retail prices weighing on household budgets." + "text": "Growth in the United Kingdom is projected to be –0.6 percent in 2023, a 0.9 percentage point downward revision from October, reflecting tighter fiscal and monetary policies and financial conditions and still-high energy retail prices weighing on household budgets." }, { "type": "ListItem", - "element_id": "536529e807f3e273a05563e438f394ff", + "element_id": "e56ea100a4185b69f955d45ff914ef57", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -753,11 +735,11 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "Growth in Japan is projected to rise to 1.8 percent in 2023, with continued monetary and fiscal policy support. High corporate profits from a depreciated yen and earlier delays inimplementing previous projects will support business investment. In 2024, growth is expectedto decline to 0.9 percent as the effects of past stimulus dissipate." + "text": "Growth in Japan is projected to rise to 1.8 percent in 2023, with continued monetary and fiscal policy support. High corporate profits from a depreciated yen and earlier delays in implementing previous projects will support business investment. In 2024, growth is expected to decline to 0.9 percent as the effects of past stimulus dissipate." 
}, { "type": "NarrativeText", - "element_id": "497b28af5c258708a114b8a6766662ce", + "element_id": "d2020af82c98a5ae355bf22ab3261e6b", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -775,7 +757,7 @@ }, { "type": "ListItem", - "element_id": "74af5288c060a6b7bc028cc0efcf59ea", + "element_id": "94385c9d723aa1c5f156fc9fad3ccc88", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -792,7 +774,7 @@ "text": "percent, respectively, after the deeper-than-expected slowdown in 2022 to 4.3 percent attributable to China’s economy. China’s real GDP slowdown in the fourth quarter of 2022 implies a 0.2 percentage point downgrade for 2022 growth to 3.0 percent—the first time in more than 40 years with China’s growth below the global average. Growth in China is projected to rise to 5.2 percent in 2023, reflecting rapidly improving mobility, and to fall to 4.5 percent in 2024 before settling at below 4 percent over the medium term amid declining business dynamism and slow progress on structural reforms. Growth in India is set to decline from 6.8 percent in 2022 to 6.1 percent in 2023 before picking up to 6.8 percent in 2024, with resilient domestic demand despite external headwinds. Growth in the ASEAN-5 countries (Indonesia, Malaysia, Philippines, Singapore, Thailand) is similarly projected to slow to 4.3 percent in 2023 and then pick up to 4.7 percent in 2024." }, { - "type": "ListItem", + "type": "NarrativeText", "element_id": "afde979c99a73646915fe253c85c5a9c", "metadata": { "data_source": { @@ -809,24 +791,6 @@ }, "text": "Growth in emerging and developing Europe is projected to have bottomed out in 2022 at 0.7 percent and, since the October forecast, has been revised up for 2023 by 0.9 percentage point to 1.5 percent. 
This reflects a smaller economic contraction in Russia in 2022 (estimated at –2.2 percent compared with a predicted –3.4 percent) followed by modestly positive growth in 2023. At the current oil price cap level of the Group of Seven, Russian crude oil export volumes are not expected to be significantly affected, with Russian trade continuing to be redirected from sanctioning to non-sanctioning countries. In Latin America and the Caribbean, growth is projected to decline from 3.9 percent in 2022 to 1.8 percent in 2023, with an upward revision for 2023 of 0.1 percentage point since October. The forecast revision reflects upgrades of 0.2 percentage point for Brazil and 0.5 percentage point for Mexico due to unexpected domestic demand resilience, higher-than-expected growth in" }, - { - "type": "ListItem", - "element_id": "e3b0c44298fc1c149afbf4c8996fb924", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 5 - }, - "text": "" - }, { "type": "ListItem", "element_id": "25072141a0ed1c9474256def9a721513", @@ -865,7 +829,7 @@ }, { "type": "NarrativeText", - "element_id": "c9b8a2f221ce7ec3213fcf4d9ce8879c", + "element_id": "ae86527b9b053129da62dcb5ed3c8aec", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -882,7 +846,7 @@ "text": "major trading partner economies, and in Brazil, greater-than-expected fiscal support. 
Growth in the region is projected to rise to 2.1 percent in 2024, although with a downward revision of 0.3 percentage point, reflecting tighter financial conditions, lower prices of exported commodities, and downward revisions to trading partner growth." }, { - "type": "ListItem", + "type": "NarrativeText", "element_id": "25e2f1dc031b5421b8a234945098e58b", "metadata": { "data_source": { @@ -919,7 +883,7 @@ }, { "type": "NarrativeText", - "element_id": "d24af8f44bd419665bb4ab6efef34fed", + "element_id": "4e5a8cc0fcd53f25fe8e41091f016f50", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -937,7 +901,7 @@ }, { "type": "NarrativeText", - "element_id": "72d289ea524eebcd8f195a8afda1c223", + "element_id": "d93c27500d6e6fc5e73a1e35fe0a36ff", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -973,7 +937,7 @@ }, { "type": "NarrativeText", - "element_id": "818b1bd0fa9714f9ce4623897ba422a8", + "element_id": "ed6efde0729a7a59cfd24802fa6edb51", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1027,7 +991,7 @@ }, { "type": "Title", - "element_id": "1ad611b76683e54171ae0b1fddd827ca", + "element_id": "4547a3f05b931a26f1cfa16dba70e121", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1099,7 +1063,7 @@ }, { "type": "Table", - "element_id": "be8ce76d10d977dedf04ead323168e3a", + "element_id": "429d7ccdab398bfb2107fa00f9054272", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1113,7 +1077,7 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "WEO Projections 1/ Estimate2022 Projections 2023 Estimate2022 Projections 2023 2021 2024 2023 2024 2024 6.2 3.4 2.9 3.1 0.2 –0.1 1.9 3.2 3.0 Advanced Economies United States Euro Area 5.4 5.9 5.3 2.6 6.8 
6.7 5.5 2.1 7.6 5.0 5.3 2.7 2.0 3.5 1.9 2.6 3.9 5.2 1.4 4.1 3.5 2.8 1.2 1.4 0.7 0.1 0.7 0.6 1.1 1.8 –0.6 1.5 2.0 1.4 1.0 1.6 1.4 1.6 0.9 2.4 0.9 0.9 1.5 2.4 0.1 0.4 0.2 0.4 0.0 0.8 –0.1 0.2 –0.9 0.0 –0.3 –0.2 –0.2 –0.2 –0.1 0.0 –0.4 –0.2 –0.4 0.3 –0.1 –0.2 1.3 0.7 1.9 1.4 0.5 2.1 2.1 1.7 0.4 2.3 1.4 1.1 1.0 0.5 0.0 0.9 0.1 1.3 1.0 –0.5 1.2 2.1 1.6 1.3 2.1 2.3 1.8 1.0 2.8 1.0 1.8 1.9 2.2 Germany France Italy Spain Japan United Kingdom Canada Other Advanced Economies 3/ Emerging Market and Developing Economies Emerging and Developing Asia 6.7 7.4 8.4 8.7 6.9 4.7 7.0 5.0 4.7 4.5 3.2 4.7 3.6 4.9 3.9 4.3 3.0 6.8 0.7 –2.2 3.9 3.1 3.1 5.3 8.7 3.8 3.0 2.6 4.0 5.3 5.2 6.1 1.5 0.3 1.8 1.2 1.7 3.2 2.6 3.8 3.2 1.2 4.2 5.2 4.5 6.8 2.6 2.1 2.1 1.5 1.6 3.7 3.4 4.1 2.9 1.3 0.3 0.4 0.8 0.0 0.9 2.6 0.1 0.2 0.5 –0.4 –1.1 0.1 0.2 0.1 –0.1 0.0 0.0 0.0 0.1 0.6 –0.3 –0.4 –0.2 0.2 0.5 0.0 0.0 0.0 2.5 3.4 2.9 4.3 –2.0 –4.1 2.6 2.8 3.7 . . . 4.6 . . . 2.6 3.0 5.0 6.2 5.9 7.0 3.5 1.0 1.9 0.8 1.1 . . . 2.7 . . . 3.1 0.5 4.1 4.9 4.1 7.1 2.8 2.0 1.9 2.2 1.9 . . . 3.5 . . . 2.9 1.8 China India 4/ Emerging and Developing Europe Russia Latin America and the Caribbean Brazil Mexico Middle East and Central Asia Saudi Arabia Sub-Saharan Africa Nigeria South Africa Memorandum World Growth Based on Market Exchange Rates European Union ASEAN-5 5/ Middle East and North Africa Emerging Market and Middle-Income Economies Low-Income Developing Countries 6.0 5.5 3.8 4.1 7.0 4.1 3.1 3.7 5.2 5.4 3.8 4.9 2.4 0.7 4.3 3.2 4.0 4.9 2.5 1.8 4.7 3.5 4.1 5.6 0.3 0.0 –0.2 –0.4 0.4 0.0 –0.1 –0.3 –0.2 0.2 0.0 0.1 1.7 1.8 3.7 . . . 2.5 . . . 2.5 1.2 5.7 . . . 5.0 . . . 2.5 2.0 4.0 . . . 4.1 . . . 10.4 9.4 12.1 5.4 6.6 3.4 2.4 2.3 2.6 3.4 2.7 4.6 –0.1 0.0 –0.3 –0.3 –0.4 0.0 . . . . . . . . . . . . . . . . . . . . . . . . . . . 
65.8 26.4 39.8 7.0 –16.2 –6.3 –7.1 –0.4 –3.3 –0.1 –0.9 0.3 11.2 –2.0 –9.8 1.4 –5.9 –0.2" + "text": "WEO Projections 1/ Estimate 2022 Projections 2023 Estimate 2022 Projections 2023 2021 2024 2023 2024 2024 6.2 3.4 2.9 3.1 0.2 –0.1 1.9 3.2 3.0 Advanced Economies United States Euro Area 5.4 5.9 5.3 2.6 6.8 6.7 5.5 2.1 7.6 5.0 5.3 2.7 2.0 3.5 1.9 2.6 3.9 5.2 1.4 4.1 3.5 2.8 1.2 1.4 0.7 0.1 0.7 0.6 1.1 1.8 –0.6 1.5 2.0 1.4 1.0 1.6 1.4 1.6 0.9 2.4 0.9 0.9 1.5 2.4 0.1 0.4 0.2 0.4 0.0 0.8 –0.1 0.2 –0.9 0.0 –0.3 –0.2 –0.2 –0.2 –0.1 0.0 –0.4 –0.2 –0.4 0.3 –0.1 –0.2 1.3 0.7 1.9 1.4 0.5 2.1 2.1 1.7 0.4 2.3 1.4 1.1 1.0 0.5 0.0 0.9 0.1 1.3 1.0 –0.5 1.2 2.1 1.6 1.3 2.1 2.3 1.8 1.0 2.8 1.0 1.8 1.9 2.2 Germany France Italy Spain Japan United Kingdom Canada Other Advanced Economies 3/ Emerging Market and Developing Economies Emerging and Developing Asia 6.7 7.4 8.4 8.7 6.9 4.7 7.0 5.0 4.7 4.5 3.2 4.7 3.6 4.9 3.9 4.3 3.0 6.8 0.7 –2.2 3.9 3.1 3.1 5.3 8.7 3.8 3.0 2.6 4.0 5.3 5.2 6.1 1.5 0.3 1.8 1.2 1.7 3.2 2.6 3.8 3.2 1.2 4.2 5.2 4.5 6.8 2.6 2.1 2.1 1.5 1.6 3.7 3.4 4.1 2.9 1.3 0.3 0.4 0.8 0.0 0.9 2.6 0.1 0.2 0.5 –0.4 –1.1 0.1 0.2 0.1 –0.1 0.0 0.0 0.0 0.1 0.6 –0.3 –0.4 –0.2 0.2 0.5 0.0 0.0 0.0 2.5 3.4 2.9 4.3 –2.0 –4.1 2.6 2.8 3.7 . . . 4.6 . . . 2.6 3.0 5.0 6.2 5.9 7.0 3.5 1.0 1.9 0.8 1.1 . . . 2.7 . . . 3.1 0.5 4.1 4.9 4.1 7.1 2.8 2.0 1.9 2.2 1.9 . . . 3.5 . . . 2.9 1.8 China India 4/ Emerging and Developing Europe Russia Latin America and the Caribbean Brazil Mexico Middle East and Central Asia Saudi Arabia Sub-Saharan Africa Nigeria South Africa Memorandum World Growth Based on Market Exchange Rates European Union ASEAN-5 5/ Middle East and North Africa Emerging Market and Middle-Income Economies Low-Income Developing Countries 6.0 5.5 3.8 4.1 7.0 4.1 3.1 3.7 5.2 5.4 3.8 4.9 2.4 0.7 4.3 3.2 4.0 4.9 2.5 1.8 4.7 3.5 4.1 5.6 0.3 0.0 –0.2 –0.4 0.4 0.0 –0.1 –0.3 –0.2 0.2 0.0 0.1 1.7 1.8 3.7 . . . 2.5 . . . 2.5 1.2 5.7 . . . 5.0 . . . 2.5 2.0 4.0 . . . 4.1 . . . 
10.4 9.4 12.1 5.4 6.6 3.4 2.4 2.3 2.6 3.4 2.7 4.6 –0.1 0.0 –0.3 –0.3 –0.4 0.0 . . . . . . . . . . . . . . . . . . . . . . . . . . . 65.8 26.4 39.8 7.0 –16.2 –6.3 –7.1 –0.4 –3.3 –0.1 –0.9 0.3 11.2 –2.0 –9.8 1.4 –5.9 –0.2" }, { "type": "Title", @@ -2755,7 +2719,7 @@ }, { "type": "NarrativeText", - "element_id": "df59a495ef85c5f70c5ba5356caf764a", + "element_id": "373341046b8ef33d588d6038817b98a0", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2773,7 +2737,7 @@ }, { "type": "ListItem", - "element_id": "000425958dcafe9c9a9c501237d8c4d3", + "element_id": "eff8c9923ee635be60ad8c6d7e891f42", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2827,7 +2791,7 @@ }, { "type": "NarrativeText", - "element_id": "a6e6e147daf229e8267d85c3e49f7250", + "element_id": "75f95ac86db86ab3315a272430deda68", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2844,7 +2808,7 @@ "text": "However, the boost to demand could stoke core inflation, leading to even tighter monetary policies and a stronger-than-expected slowdown later on. Pent-up demand could also fuel a stronger rebound in China." 
}, { - "type": "ListItem", + "type": "UncategorizedText", "element_id": "2bbe57e6c291db638d3fcddca9e0199a", "metadata": { "data_source": { @@ -2863,7 +2827,7 @@ }, { "type": "ListItem", - "element_id": "a91232dce89744a5e3ea54c5a9d83110", + "element_id": "a5ef6992dc2c2b5b122b764e5f23d66a", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2881,7 +2845,7 @@ }, { "type": "NarrativeText", - "element_id": "ab2ac0c0c558600b645acb6349ccf2df", + "element_id": "ddf9839e0b050a2c7b2e3a502e0be91c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2899,7 +2863,7 @@ }, { "type": "ListItem", - "element_id": "1bbcee85386321e6e8235a64d4c34d73", + "element_id": "925934587d21dc69b7040e5299c3957e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2917,7 +2881,7 @@ }, { "type": "ListItem", - "element_id": "4e2bc46d4988ddde43a4f295d1d458c2", + "element_id": "396c7bd25bf4d65da962d23adbb8cbe2", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2934,7 +2898,7 @@ "text": "vulnerability, particularly for Europe and lower-income countries. Europe is facing lower-than- anticipated gas prices, having stored enough gas to make shortages unlikely this winter. However, refilling storage with much-diminished Russian flows will be challenging ahead of next winter, particularly if it is a very cold one and China’s energy demand picks up, causing price spikes. A possible increase in food prices from a failed extension of the Black Sea grain initiative would put further pressure on lower-income countries that are experiencing food insecurity and have limited budgetary room to cushion the impact on households and businesses. With elevated food and fuel prices, social unrest may increase." 
}, { - "type": "ListItem", + "type": "NarrativeText", "element_id": "2d14934d52ff357c52e9ae1c38f7390e", "metadata": { "data_source": { @@ -2953,7 +2917,7 @@ }, { "type": "ListItem", - "element_id": "e3b0c44298fc1c149afbf4c8996fb924", + "element_id": "bc6284f4d0f59f3cdadf10b2efc77c18", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -2967,29 +2931,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "" + "text": "Sudden financial market repricing: A premature easing in financial conditions in response to lower headline inflation data could complicate anti-inflation policies and necessitate additional monetary tightening. For the same reason, unfavorable inflation data releases could trigger sudden repricing of assets and increase volatility in financial markets. Such movements could strain liquidity and the functioning of critical markets, with ripple effects on the real economy." }, { - "type": "ListItem", - "element_id": "4ce40bcfac131ab024e535bf860f9495", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": " Sudden financial market repricing: A premature easing in financial conditions in response to lower headline inflation data could complicate anti-inflation policies and necessitate additional monetary tightening. For the same reason, unfavorable inflation data releases could trigger sudden repricing of assets and increase volatility in financial markets. Such movements could strain liquidity and the functioning of critical markets, with ripple effects on the real economy." 
- }, - { - "type": "ListItem", - "element_id": "75bd22ee0ba778cc3a616ed0a9b42292", + "type": "NarrativeText", + "element_id": "d4f397f8b162452852c0a8fb898c5e8a", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3003,11 +2949,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "Geopolitical fragmentation: The war in Ukraine and the related international sanctions aimed at  pressuring Russia to end hostilities are splitting the world economy into blocs and reinforcing" + "text": "Geopolitical fragmentation: The war in Ukraine and the related international sanctions aimed at pressuring Russia to end hostilities are splitting the world economy into blocs and reinforcing" }, { "type": "ListItem", - "element_id": "07e548fa6deaf8131db26e2cad4f5ce8", + "element_id": "828061b3c51e703fa7f8a7d5fc8271b1", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3061,7 +3007,7 @@ }, { "type": "NarrativeText", - "element_id": "6684fee3e3cd949ec59e7444a0c3fd0c", + "element_id": "fa5b96150a767439834d76af71b6f7a8", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3097,7 +3043,7 @@ }, { "type": "NarrativeText", - "element_id": "1c464362698203e7245bdaf33c388a80", + "element_id": "91c5d881804324614a0f24e7d96ad42a", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3115,7 +3061,7 @@ }, { "type": "NarrativeText", - "element_id": "d6138134f71f953a9da2083154e2629e", + "element_id": "55fff1c4b3f7a8ff06a6c46d21d66cd1", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3133,7 +3079,7 @@ }, { "type": "NarrativeText", - "element_id": "2457fbbf5aa862b5a8b45d070f9114cb", + "element_id": "4e5bd723dd23dd1d479acbfcf2e5a9d5", "metadata": { "data_source": { 
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3151,7 +3097,7 @@ }, { "type": "NarrativeText", - "element_id": "bcef6ce9e3d4c015db21955dc4f6ce42", + "element_id": "a4b8951a10e5e3d05b83227fcc4d02d8", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3169,7 +3115,7 @@ }, { "type": "NarrativeText", - "element_id": "defb87cb8f10236768732a1e5fe9519f", + "element_id": "6a18bf042a98d772d5995a6140cc1a9c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3187,7 +3133,7 @@ }, { "type": "NarrativeText", - "element_id": "bda037ffd6adfee8afa08544ca03a391", + "element_id": "64e79c327a2e74933f25bd3e9caf09f5", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3241,7 +3187,7 @@ }, { "type": "NarrativeText", - "element_id": "2e9a0eaddd75095d1bbb4fda6f2c4feb", + "element_id": "907d316caf83dc6066a2a29e7a671f7d", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3259,7 +3205,7 @@ }, { "type": "NarrativeText", - "element_id": "da0ef04b13917f67583290e9ba57e375", + "element_id": "17e2197f8b893177f51752cab299e36f", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3277,7 +3223,7 @@ }, { "type": "NarrativeText", - "element_id": "9b451c78081780087a0e1e67cc0eaa1d", + "element_id": "bc6f2a8f19639ee36e3298fa992b76e6", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3295,7 +3241,7 @@ }, { "type": "ListItem", - "element_id": "38ddb95e69fa17a6f9ccb3d04033fee2", + "element_id": "b411a2a97519dabf36abc764adda53dc", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3313,7 +3259,7 @@ }, { "type": 
"NarrativeText", - "element_id": "add6f9f296b6a99cf0ef86162b3c9cfc", + "element_id": "5dff888d3e9f315d88116aa3660686f9", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3327,11 +3273,11 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "distribution of vaccines and treatments. Public support for the development of new vaccine technologies and the design of systematic responses to future epidemics also remains essential.  Addressing debt distress: Progress has been made for countries that requested debt treatment under the Group of Twenty’s Common Framework initiative, and more will be needed to strengthen it. It is also necessary to agree on mechanisms to resolve debt in a broader set of economies, including middle-income countries that are not eligible under the Common Framework. Non– Paris Club and private creditors have a crucial role to play in ensuring coordinated, effective, and timely debt resolution processes." + "text": "distribution of vaccines and treatments. Public support for the development of new vaccine technologies and the design of systematic responses to future epidemics also remains essential. Addressing debt distress: Progress has been made for countries that requested debt treatment under the Group of Twenty’s Common Framework initiative, and more will be needed to strengthen it. It is also necessary to agree on mechanisms to resolve debt in a broader set of economies, including middle-income countries that are not eligible under the Common Framework. Non– Paris Club and private creditors have a crucial role to play in ensuring coordinated, effective, and timely debt resolution processes." 
}, { "type": "ListItem", - "element_id": "9b84ed98ce1c2e518bf677e6be62ac03", + "element_id": "3d668f85b6a6bd19156d78c62f740006", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3349,7 +3295,7 @@ }, { "type": "ListItem", - "element_id": "a5751b5964fbbc37b14db4811aeb37f4", + "element_id": "950260886a2d8eb808263bc73ec898f1", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3363,11 +3309,11 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": " Using the global financial safety net: With the cascading of shocks to the global economy, using the global financial safety net to its fullest extent is appropriate, including by proactively utilizing the IMF’s precautionary financial arrangements and channeling aid from the international community to low-income countries facing shocks." + "text": "Using the global financial safety net: With the cascading of shocks to the global economy, using the global financial safety net to its fullest extent is appropriate, including by proactively utilizing the IMF’s precautionary financial arrangements and channeling aid from the international community to low-income countries facing shocks." 
}, { "type": "ListItem", - "element_id": "57a97a0ecd83f391b810800368a1dc27", + "element_id": "556c04bb61f902927d202956f7d2d6fd", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3421,7 +3367,7 @@ }, { "type": "NarrativeText", - "element_id": "a2fa3a13e51ab7dd0859ee2c869b70e5", + "element_id": "a07efcab5056130c10048443a2bf8a3a", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3439,7 +3385,7 @@ }, { "type": "NarrativeText", - "element_id": "f79a09409db68af141e82d9ac113ded8", + "element_id": "9875c08b39a4905c52bef432c042c0bb", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3457,7 +3403,7 @@ }, { "type": "Image", - "element_id": "332963f26d8a6ec6c59d02201966d327", + "element_id": "5ad3e97ac0a2d759e059893765b81954", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3471,11 +3417,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "United StatesEuro areaChinaOther AEsOther EMs 7 October2022 GFSR 6 5 4 3 2 1 0 –1 –2 –3 2006 0808 10 10 12 12 14 16 14 18 18 20 2222 06 16 20" + "text": "United States Euro area China Other AEs Other EMs 7 October 2022 GFSR 6 5 4 3 2 1 0 –1 –2 –3 2006 08 08 10 10 12 12 14 16 14 18 18 20 22 22 06 16 20 " }, { "type": "FigureCaption", - "element_id": "d78f392a386b26aa260548d71936abff", + "element_id": "3e5456c4e156292c6284a528b3d3fb0c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3529,7 +3475,7 @@ }, { "type": "Image", - "element_id": "7857926e06305cd67c3080b14e94d317", + "element_id": "5728dbbab19d146278a6a3387e8e40d5", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3543,7 +3489,7 @@ "filetype": "application/pdf", 
"page_number": 11 }, - "text": "Latest October 2022 GFSR 5 6 2. Euro area 1. United States 5 4 4 3 3 2 2 1 1 Oct.22 Apr.23 Oct.23 Dec.24 Dec.26 Oct.22 Apr.23 Oct.23 Dec.24 Dec.26" + "text": "Latest October 2022 GFSR 5 6 2. Euro area 1. United States 5 4 4 3 3 2 2 1 1 Oct. 22 Apr. 23 Oct. 23 Dec. 24 Dec. 26 Oct. 22 Apr. 23 Oct. 23 Dec. 24 Dec. 26 " }, { "type": "NarrativeText", @@ -3583,7 +3529,7 @@ }, { "type": "NarrativeText", - "element_id": "a404b982431c5d79e96fa2c0fdd1544d", + "element_id": "9aa84a204c906f24862e2e3326cac381", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3619,7 +3565,7 @@ }, { "type": "NarrativeText", - "element_id": "06d12185958a014c0c9d6afeab7426c2", + "element_id": "0e1a77d4edd2f7419db77d9cdff99551", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", diff --git a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/Silent-Giant-(1).pdf.json b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/Silent-Giant-(1).pdf.json index d0d496946a..fb02f7ba9a 100644 --- a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/Silent-Giant-(1).pdf.json +++ b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/Silent-Giant-(1).pdf.json @@ -19,7 +19,7 @@ }, { "type": "Title", - "element_id": "51174df4a3a78fe261885b1818b66876", + "element_id": "b0f30096c91b740f061fd09cc6b86f2b", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -37,7 +37,7 @@ }, { "type": "NarrativeText", - "element_id": "e2b1006b190b699d597fdb0f1d73f8f9", + "element_id": "c56cacd540b7d88bfe08d824c0ced992", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -73,7 +73,7 @@ }, { "type": "Title", - "element_id": "2fa985d0a50e61b09ec22c447cc4b2c9", + "element_id": 
"d4d5f1410356bb0053eeff6a9d2e84ae", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -91,7 +91,7 @@ }, { "type": "NarrativeText", - "element_id": "1f4925fb064910ee923ccc1f6b20715b", + "element_id": "968d079b21338bb15f266e70ed001fed", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -109,7 +109,7 @@ }, { "type": "NarrativeText", - "element_id": "46385c950e7da4d8e588686a541335c2", + "element_id": "b454d477ce7af47a3cce791045b27fb7", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -127,7 +127,7 @@ }, { "type": "NarrativeText", - "element_id": "ae77460bce2d3a52d823954ccb9c708f", + "element_id": "b00c97f0af489349e26f90166924530c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -145,7 +145,7 @@ }, { "type": "NarrativeText", - "element_id": "8e1e0570b2ba9211cc184c21a3ffbf90", + "element_id": "8ab9e34861e39e7de50648b982afd89e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -163,7 +163,7 @@ }, { "type": "NarrativeText", - "element_id": "c6d2fa859e6df9845dee4044d05ddbc5", + "element_id": "2ee5188001ff61cf0d0e40659ce7bc49", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -181,7 +181,7 @@ }, { "type": "NarrativeText", - "element_id": "e055395659c9e1aa4d5c0afb188e4a9e", + "element_id": "1d53fd350bd8dd3a38db6787b7ef77cf", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -199,7 +199,7 @@ }, { "type": "NarrativeText", - "element_id": "33a2aba13d6b228d8d6f792f16caa684", + "element_id": "01b622a95350cc48bafc9165732d661a", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -217,7 +217,7 @@ }, { "type": "ListItem", - 
"element_id": "9209d9a3c8ea19bed487dff9476428ee", + "element_id": "494d55b19020277b68f13daf5464a252", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -235,7 +235,7 @@ }, { "type": "ListItem", - "element_id": "ae74ee3ddcecd2ffb75672d469c80a0e", + "element_id": "d2bff80ca96af626923ef67c2a927f2f", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -271,7 +271,7 @@ }, { "type": "Footer", - "element_id": "6b86b273ff34fce19d6b804eff5a3f57", + "element_id": "0fc5165686190ca845407c03ad4572e8", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -289,7 +289,7 @@ }, { "type": "Header", - "element_id": "53c234e5e8472b6ac51c1ae1cab3fe06", + "element_id": "5749fdd6b67e4204b3047ba33540bc87", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -307,7 +307,7 @@ }, { "type": "Title", - "element_id": "257fa04b9d79fc46da551d720411595a", + "element_id": "3b5a5220792fcbec0b59d2088bc4c9ab", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -325,7 +325,7 @@ }, { "type": "NarrativeText", - "element_id": "ca18f74506ddc1bca89179259f3ff4cb", + "element_id": "542bf86ba9bab1357e3aaa0b4ae0ff70", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -379,7 +379,7 @@ }, { "type": "Image", - "element_id": "adaa6130ff3fb4154048fc2c431ad232", + "element_id": "27a3cde643219ef7662f032684e06bd4", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -393,7 +393,7 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": " Marine CSP 40,000 Solar PV 35,000 Geothermal 30,000 Wind Bioenergy 25,000 Hydro 20,000 Nuclear 15,000 Gas 10,000 Oil Coal 5,000 0" + "text": " Marine CSP 40,000 Solar PV 35,000 Geothermal 30,000 
Wind Bioenergy 25,000 Hydro 20,000 Nuclear 15,000 Gas 10,000 Oil Coal 5,000 0 " }, { "type": "UncategorizedText", @@ -487,7 +487,7 @@ }, { "type": "FigureCaption", - "element_id": "578e73d091a9463a76ea7502a6a92503", + "element_id": "befa378e171bb64fdf68091abf3501bd", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -505,7 +505,7 @@ }, { "type": "NarrativeText", - "element_id": "427b54db6e4b434f92954bc67db93473", + "element_id": "2ab7465d2e5fa24b6724907f968cd4aa", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -523,7 +523,7 @@ }, { "type": "NarrativeText", - "element_id": "92f6fd6a561b87154049d083b93b611d", + "element_id": "76b7731e68f6ef2b8958ea4f1ec351af", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -559,7 +559,7 @@ }, { "type": "Image", - "element_id": "08ecd96cc879b82950d1204ea4e7d6d9", + "element_id": "152a83b89c4b24f7f3db154d0c3ddc1b", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -573,7 +573,7 @@ "filetype": "application/pdf", "page_number": 4 }, - "text": "30,000,000 High-carbon Low-carbon 25,000,000 20,000,000 15,000,000 10,000,000 5,000,000 0" + "text": "30,000,000 High-carbon Low-carbon 25,000,000 20,000,000 15,000,000 10,000,000 5,000,000 0 " }, { "type": "UncategorizedText", @@ -685,7 +685,7 @@ }, { "type": "FigureCaption", - "element_id": "aa04bda99d06997f39a4b613c2c62be5", + "element_id": "e8facf920827e7bb6c64b065223f7c1e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -703,7 +703,7 @@ }, { "type": "NarrativeText", - "element_id": "d841776bdfaae69274a3c8b898021653", + "element_id": "5f0bfed5240c4c0d50a2b4d2b56d8e6f", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -721,7 +721,7 @@ }, { "type": 
"NarrativeText", - "element_id": "10a72512425bbe7a4cdd6529b0337d90", + "element_id": "a58efe5247845dc40b0a648f1761ad5c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -739,7 +739,7 @@ }, { "type": "NarrativeText", - "element_id": "030d3154a592248139651c5f8fbef1d5", + "element_id": "e81223a218fb419ef983253212ce7e22", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -757,7 +757,7 @@ }, { "type": "Footer", - "element_id": "4e07408562bedb8b60ce05c1decfe3ad", + "element_id": "70d25f2c1428def16804c3b346ee8d13", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -775,7 +775,7 @@ }, { "type": "Footer", - "element_id": "4b227777d4dd1fc61c6f884f48641d02", + "element_id": "0b0bd7ca2acebad288fe09c9d9595f1f", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -793,7 +793,7 @@ }, { "type": "NarrativeText", - "element_id": "a53cecd93ffb9ec731b7974f1805e924", + "element_id": "6db8df46f6adb58be10bf8d88e53d42e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -811,7 +811,7 @@ }, { "type": "Title", - "element_id": "899a2c517ba69726f3808d66f442e439", + "element_id": "d2b22b470eb3ab5829c6e3efb55e49a4", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -829,7 +829,7 @@ }, { "type": "NarrativeText", - "element_id": "a8c17b6aa3cad915f2f7e0126706c2f5", + "element_id": "fde5c605b758e800b43c8e5844d0eb39", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -847,7 +847,7 @@ }, { "type": "NarrativeText", - "element_id": "7562e707e991f1fb634fff41f2cae0e4", + "element_id": "c262b08f408059316ea2bfe106d4996b", "metadata": { "data_source": { "url": 
"s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -865,7 +865,7 @@ }, { "type": "NarrativeText", - "element_id": "1cde21cc10aa769a17ca11aa1e10823e", + "element_id": "45a4d31b0300260dec8f3f86ec2ba0ad", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -883,7 +883,7 @@ }, { "type": "NarrativeText", - "element_id": "af2424b7ec665072a2ee0bdcd901e244", + "element_id": "c550d4a50eaa40717f1d857bce491a81", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -919,7 +919,7 @@ }, { "type": "Image", - "element_id": "82ef36cba07b18d12e76b25316a913ad", + "element_id": "196f551acb55f0373a9d7fac6c9dbeab", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -933,11 +933,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "300 250 200 150 100 50 0 O nshore Wind Offshore Wind N uclear m ercialPhotovoltaic C oal C C G T C o m" + "text": "300 250 200 150 100 50 0 O nshore Wind Offshore Wind N uclear m ercial Photovoltaic C oal C C G T C o m " }, { "type": "FigureCaption", - "element_id": "a5846cd18e790db780cc03f9e5f63278", + "element_id": "c23c8bcefbe3c4d1e5b55df29b717fc1", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -955,7 +955,7 @@ }, { "type": "NarrativeText", - "element_id": "9ad4cf48d0b9d0bbfd257214f3d050dd", + "element_id": "2e16c4200e350d951a1911e164cc7a7d", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -973,7 +973,7 @@ }, { "type": "NarrativeText", - "element_id": "13ff2375260e277c2dfbc8826aa50a65", + "element_id": "5b702c79a0deb88609b6a9b76a8ff4b1", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -991,7 +991,7 @@ }, { "type": "NarrativeText", - "element_id": "4b3dad9b769c100e89b2c082e7d9e13e", + 
"element_id": "34e41476b3562e66cc04edd8a2ba4eb7", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1009,7 +1009,7 @@ }, { "type": "NarrativeText", - "element_id": "0ce74aa5e786157de72d5ae801d86cc4", + "element_id": "5e1e7b461ccf41232b3daf5b54d59399", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1027,7 +1027,7 @@ }, { "type": "Footer", - "element_id": "ef2d127de37b942baad06145e54b0c61", + "element_id": "aec400e3e65dc09b31513694bc9893b9", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1045,7 +1045,7 @@ }, { "type": "Footer", - "element_id": "06e9d52c1720fca412803e3b07c4b228", + "element_id": "29399af043bbf069ecfd1abdcaee4b15", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1063,7 +1063,7 @@ }, { "type": "NarrativeText", - "element_id": "2cf9c478a20b21f5792941a179d926e9", + "element_id": "2fb9de7ce072904e3da50fc724ce8b12", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1099,7 +1099,7 @@ }, { "type": "Image", - "element_id": "0f2d0fcb85c227ec422bc38a9902a394", + "element_id": "577fd212dac38df299e478d6b7ce5d74", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1113,7 +1113,7 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "120 120 99.5 100 71.9 80 60 40 20 8.5 1.78 0.245 <0.01 0 Offshore wind O nshore wind(G erm any) C oal Oil N atural gas N uclear* S olar P V (U K)" + "text": "120 120 99.5 100 71.9 80 60 40 20 8.5 1.78 0.245 <0.01 0 Offshore wind O nshore wind (G erm any) C oal Oil N atural gas N uclear* S olar P V (U K) " }, { "type": "NarrativeText", @@ -1225,7 +1225,7 @@ }, { "type": "FigureCaption", - "element_id": "445676822969fb5177c0081d07449a70", + "element_id": 
"c12be3875fcb0e34681e80c68ced624f", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1243,7 +1243,7 @@ }, { "type": "Title", - "element_id": "98d83a387e3ac2261daaf8d936bf3e27", + "element_id": "3472694e01617965f38369c536bdc070", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1261,7 +1261,7 @@ }, { "type": "NarrativeText", - "element_id": "1119369ba9a68924c64155762de72d8e", + "element_id": "49b5bd9868a164cf35b10cbb343ddba0", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1315,7 +1315,7 @@ }, { "type": "Image", - "element_id": "7e02da28a2dd800555ed667258895ebc", + "element_id": "1459b67becac6e70efecfcbc9312d3f0", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1329,7 +1329,7 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": " Coal 90 Gas/Oil 80 Biofuels/Waste Wind/Solar 70 Hydro 60 Nuclear 50 40 30 20 10" + "text": " Coal 90 Gas/Oil 80 Biofuels/Waste Wind/Solar 70 Hydro 60 Nuclear 50 40 30 20 10 " }, { "type": "UncategorizedText", @@ -1369,7 +1369,7 @@ }, { "type": "FigureCaption", - "element_id": "853637136575897a73cba3c5fb085e8c", + "element_id": "e720e656050a5b95706b7ffefb3ff505", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1387,7 +1387,7 @@ }, { "type": "FigureCaption", - "element_id": "2275583196d791405892aaca0d87743c", + "element_id": "0b7637d5dd3c0e09980fb400ebbdcf72", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1405,7 +1405,7 @@ }, { "type": "FigureCaption", - "element_id": "fd1b6d076800203a708efab109d9393a", + "element_id": "f18421197d4b12d362e1ededa3f3145f", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1423,7 +1423,7 @@ 
}, { "type": "Image", - "element_id": "bd278705b60b07b012155a5883b6c09b", + "element_id": "ff963f0df99d82f7c343649121217117", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1437,7 +1437,7 @@ "filetype": "application/pdf", "page_number": 9 }, - "text": "600 Non-hydro 500 ren. & waste Nuclear 400 Natural gas 300 Hydro Oil 200 Coal 100 0" + "text": "600 Non-hydro 500 ren. & waste Nuclear 400 Natural gas 300 Hydro Oil 200 Coal 100 0 " }, { "type": "Title", @@ -1459,7 +1459,7 @@ }, { "type": "FigureCaption", - "element_id": "ff8db11f410c00860c60393cc143175f", + "element_id": "3d193aba53b016527c3f658cdd6d99e2", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1477,7 +1477,7 @@ }, { "type": "FigureCaption", - "element_id": "3b5b3755bac62d7f53eb84cadc34c528", + "element_id": "13d3d626e6be5671aed83c1270851087", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1495,7 +1495,7 @@ }, { "type": "NarrativeText", - "element_id": "4f5cc927b953f3c49c562a22c88f863f", + "element_id": "c671a138a73229210d751a2857ff503d", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1513,7 +1513,7 @@ }, { "type": "Image", - "element_id": "36ca9b7cdbbcba729a46487cf86c07eb", + "element_id": "2f701c7144dd1588cdb70e2a188c1418", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1527,11 +1527,11 @@ "filetype": "application/pdf", "page_number": 9 }, - "text": "One fuel pellet contains as much energy as a tonne of coal" + "text": "One fuel pellet contains as much energy as a tonne of coal " }, { "type": "NarrativeText", - "element_id": "0e28734a89e6f2473c6bbd5c1bdaf50e", + "element_id": "a0fef2750bd4fbae8e9e28211ceea788", "metadata": { "data_source": { "url": 
"s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1549,7 +1549,7 @@ }, { "type": "NarrativeText", - "element_id": "81a65c45b597c6647c9f984f7b2e3554", + "element_id": "37eafc6215975a858fa506f14ea98982", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1567,7 +1567,7 @@ }, { "type": "Header", - "element_id": "7902699be42c8a8e46fbbb4501726517", + "element_id": "c69b33366ea0bcfb6c30799a4100c6a0", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1585,7 +1585,7 @@ }, { "type": "Footer", - "element_id": "aa67a169b0bba217aa0aa88a65346920", + "element_id": "671a02117af87c2371462f800d856f15", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1603,7 +1603,7 @@ }, { "type": "NarrativeText", - "element_id": "4c23c5c4e459d5f3f6f62cc6a06a816a", + "element_id": "c948e12a05e40ac00f3e5321f6ae7742", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1621,7 +1621,7 @@ }, { "type": "NarrativeText", - "element_id": "cd055b546424c5003939bb047a56abf0", + "element_id": "0748d61bf0b14f9f30e6cfbfb57034d4", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1639,7 +1639,7 @@ }, { "type": "NarrativeText", - "element_id": "a654080ea22f70c397bca52fee82b82f", + "element_id": "395face9e1f924d1f2a0d746f317c5b9", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1657,7 +1657,7 @@ }, { "type": "Title", - "element_id": "e56261e0bd30965b8e68ed2abb15b141", + "element_id": "a0d7deccf89e42d02a9d66b0c1889689", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1855,7 +1855,7 @@ }, { "type": "ListItem", - "element_id": "5986cde0b872e4b1253cf1f5e82360b2", + "element_id": 
"dcf0fcce0dd00a5335e8e42658aacc75", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1909,7 +1909,7 @@ }, { "type": "ListItem", - "element_id": "2ac3e029f2ae0ed36a9af34bd225e889", + "element_id": "0b9ea5c0804f5a369317ffcf363badf3", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1927,7 +1927,7 @@ }, { "type": "NarrativeText", - "element_id": "ab9c4428d3394fd230d7636bea5030d5", + "element_id": "c2fb0a6722612bb6055e56fea799a81b", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1945,7 +1945,7 @@ }, { "type": "NarrativeText", - "element_id": "821daa4396c0087d9d5ee9240bc5c85c", + "element_id": "8ff63a0f4af4de37eff90952d575f76d", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1963,7 +1963,7 @@ }, { "type": "NarrativeText", - "element_id": "c48603fd38d3449d3afcd2dc18903083", + "element_id": "06d4880e4a23a9520618a50bfbbdf940", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1981,7 +1981,7 @@ }, { "type": "NarrativeText", - "element_id": "705da4db5e220010ddfd03d9452855e4", + "element_id": "4ac69378210b56e0c98be2d41e374769", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -1999,7 +1999,7 @@ }, { "type": "NarrativeText", - "element_id": "de49f1c955d7c8a4d1d6d261c1cf21ba", + "element_id": "3df4f87a759f8e1a2e3ab0a186ac16ef", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", diff --git a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/recalibrating-risk-report.pdf.json b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/recalibrating-risk-report.pdf.json index 3ab10c83c2..adc4410d50 100644 --- 
a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/recalibrating-risk-report.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/recalibrating-risk-report.pdf.json @@ -19,7 +19,7 @@ }, { "type": "Title", - "element_id": "d72f07e2c764ae90417305db928ebce1", + "element_id": "61a1e4ec9d7a8140a78bbb7450ec65ca", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -37,7 +37,7 @@ }, { "type": "NarrativeText", - "element_id": "c875f7e098e5ea1b337a189c28e80ac3", + "element_id": "ad3db440731a2892ac234c2ee7bb5b04", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -55,7 +55,7 @@ }, { "type": "NarrativeText", - "element_id": "327be60d66a34747047e1365e6bab727", + "element_id": "f36c8656d6b853a5728f6f17c29706f3", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -73,7 +73,7 @@ }, { "type": "NarrativeText", - "element_id": "0e7b344a22dd76ce94588c537d418717", + "element_id": "a23e0b3be89e2c3a3088fdbddfa3bcb1", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -91,7 +91,7 @@ }, { "type": "Title", - "element_id": "2fa985d0a50e61b09ec22c447cc4b2c9", + "element_id": "d4d5f1410356bb0053eeff6a9d2e84ae", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -109,7 +109,7 @@ }, { "type": "NarrativeText", - "element_id": "eeb2fd62f21f17d70b2c51f4857426fe", + "element_id": "19d767c50c1203bd1d3882353b464e73", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -127,7 +127,7 @@ }, { "type": "NarrativeText", - "element_id": "3689b86ea677b25a3ce9586c4be41a46", + "element_id": "ffcf7b0ae159cc2a1bc0a07e3618454a", "metadata": { "data_source": { "url": 
"s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -145,7 +145,7 @@ }, { "type": "NarrativeText", - "element_id": "ee4ac543bf2035b86b6818e06e3a0a90", + "element_id": "c25840980b1b27ba0706d4319d6e3aa8", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -163,7 +163,7 @@ }, { "type": "NarrativeText", - "element_id": "c89f871dfc13c4c4bcde1f9e241f17da", + "element_id": "0048cc9470dc1736e2fc654d7236a3ee", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -181,7 +181,7 @@ }, { "type": "NarrativeText", - "element_id": "f62c49fcf0a7960d0b509e37507d76d3", + "element_id": "676bbd12ba352a9d057d15cb8c925e36", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -199,7 +199,7 @@ }, { "type": "Footer", - "element_id": "6b86b273ff34fce19d6b804eff5a3f57", + "element_id": "0fc5165686190ca845407c03ad4572e8", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -217,7 +217,7 @@ }, { "type": "Footer", - "element_id": "53c234e5e8472b6ac51c1ae1cab3fe06", + "element_id": "5749fdd6b67e4204b3047ba33540bc87", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -235,7 +235,7 @@ }, { "type": "Title", - "element_id": "6b3149c1769f5cd200ec2a0017b936dc", + "element_id": "a6c3efab675abc38935ff01d2d8b8386", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -253,7 +253,7 @@ }, { "type": "NarrativeText", - "element_id": "ce5bcf6b4fe24d62bd24d156d5bc965e", + "element_id": "dbe6820b5750298e87712c37dfe97b7d", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -307,7 +307,7 @@ }, { "type": "Table", - "element_id": 
"bd73364ecc77e30dd55e632e93e4583d", + "element_id": "9405da801e46d0da5f19ea801ff4ff51", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -557,42 +557,6 @@ }, "text": "4" }, - { - "type": "UncategorizedText", - "element_id": "d1429f8178a04f7fc73a66edf10ab8b5", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "" - }, - { - "type": "UncategorizedText", - "element_id": "d1429f8178a04f7fc73a66edf10ab8b5", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "" - }, { "type": "Title", "element_id": "1656c455012b016fbac5eac0a38397bd", @@ -647,42 +611,6 @@ }, "text": "17" }, - { - "type": "UncategorizedText", - "element_id": "d1429f8178a04f7fc73a66edf10ab8b5", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "" - }, - { - "type": "UncategorizedText", - "element_id": "d1429f8178a04f7fc73a66edf10ab8b5", - 
"metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "" - }, { "type": "UncategorizedText", "element_id": "7902699be42c8a8e46fbbb4501726517", @@ -737,42 +665,6 @@ }, "text": "22" }, - { - "type": "UncategorizedText", - "element_id": "d1429f8178a04f7fc73a66edf10ab8b5", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "" - }, - { - "type": "UncategorizedText", - "element_id": "d1429f8178a04f7fc73a66edf10ab8b5", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "" - }, { "type": "UncategorizedText", "element_id": "b7a56873cd771f2c446d369b649430b6", @@ -829,7 +721,7 @@ }, { "type": "NarrativeText", - "element_id": "3cf0a9c5ad0cacc724f90abbe99664d9", + "element_id": "8c4b7bab8fb561388b19a37576e4665f", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -847,7 +739,7 @@ }, { "type": "NarrativeText", - 
"element_id": "82cf60d4b6b58dd2d61b49884fceb83d", + "element_id": "cd51de1780ee75925a66b066fb7e4e01", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -865,7 +757,7 @@ }, { "type": "NarrativeText", - "element_id": "0d28f703c3b3aa9fee1f9f08fa688409", + "element_id": "80b622f9209254c3963bf7431da6a13a", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -883,7 +775,7 @@ }, { "type": "Image", - "element_id": "557e455200e568a1b8ce1fa205432b10", + "element_id": "ae4cbd98081526c1b5f7fbd30f47c869", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -897,11 +789,11 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "Natural Artificial 48% Radon 14% Buildings & soil 12% Food & water 10% Cosmic 4% Thoron 11% Medicine 0.4% 0.4% Miscellaneous 0.2% Occupational 0.04% Nuclear discharges Fallout" + "text": "Natural Artificial 48% Radon 14% Buildings & soil 12% Food & water 10% Cosmic 4% Thoron 11% Medicine 0.4% 0.4% Miscellaneous 0.2% Occupational 0.04% Nuclear discharges Fallout " }, { "type": "FigureCaption", - "element_id": "9b657ab0d2ea482c887c7877ba86598d", + "element_id": "d6c864820d8af0aed00d45c41ca0691e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -919,7 +811,7 @@ }, { "type": "NarrativeText", - "element_id": "4469b98946c004fbae47ad6285c9bba4", + "element_id": "829f6e4c5ed9b4e900ab4a696e46687b", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -937,7 +829,7 @@ }, { "type": "NarrativeText", - "element_id": "cbf390f564b0b1197deb5bf3dd999291", + "element_id": "062875b861d4d40027ab674686333587", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1063,7 +955,7 @@ }, { 
"type": "Image", - "element_id": "86c85866eb204cac66d78366332b5f42", + "element_id": "0bcb3759fa68b68d784c3c3963253c90", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1077,11 +969,11 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "25 20 18.4 15 10 4.6 5 2.8 0.07 0.04 0.02 0.01 0 C oal Oil Bio m ass N atural gas Wind H ydropo w er S olar N uclear" + "text": "25 20 18.4 15 10 4.6 5 2.8 0.07 0.04 0.02 0.01 0 C oal Oil Bio m ass N atural gas Wind H ydropo w er S olar N uclear " }, { "type": "FigureCaption", - "element_id": "8e44807922e69a38594c4b389cd0be54", + "element_id": "2c07d964a50db8baf3ed9db257827518", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1099,7 +991,7 @@ }, { "type": "NarrativeText", - "element_id": "bf88d949b16b32347c420a66fa413d49", + "element_id": "47ad83d77e60857060e4435724be6db6", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1135,7 +1027,7 @@ }, { "type": "ListItem", - "element_id": "31138d5dc0c297144d27d5dbd15d5ef0", + "element_id": "30844d5faa0b85b758a56d22dd4c5048", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1153,7 +1045,7 @@ }, { "type": "Header", - "element_id": "4e07408562bedb8b60ce05c1decfe3ad", + "element_id": "70d25f2c1428def16804c3b346ee8d13", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1171,7 +1063,7 @@ }, { "type": "Footer", - "element_id": "7de1555df0c2700329e815b93b32c571", + "element_id": "0b0bd7ca2acebad288fe09c9d9595f1f", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1189,7 +1081,7 @@ }, { "type": "Title", - "element_id": "b6812463b15ddda3f2402dfda95d2c86", + "element_id": 
"b3dc46f381163a6ba4304765c36bc32e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1207,7 +1099,7 @@ }, { "type": "NarrativeText", - "element_id": "ec0fb27e2a16f77899bf83591cd2d0de", + "element_id": "8678fa69494c8706052cd795f4f104b2", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1225,7 +1117,7 @@ }, { "type": "NarrativeText", - "element_id": "d6bd9451ceee595c090d110656bb1b2b", + "element_id": "69172a66b601bc530a3d701869fe70ff", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1243,7 +1135,7 @@ }, { "type": "NarrativeText", - "element_id": "d8c68c0317a4a3867de201703e068e2e", + "element_id": "38bd099c0174246d8ac9b5316877b4cb", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1261,7 +1153,7 @@ }, { "type": "NarrativeText", - "element_id": "e5dec03340d86adfd26612d5d06ab5e6", + "element_id": "006f701bd8073d16266a7877cc66ca8e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1279,7 +1171,7 @@ }, { "type": "Title", - "element_id": "3506b7d2b1626663985ae1a521a60fe1", + "element_id": "14652e5114533ec4e41bae34a9ae8508", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1297,7 +1189,7 @@ }, { "type": "NarrativeText", - "element_id": "00548dbd288df8370c39789adb302f50", + "element_id": "de64c96ca0e7c0888fe7ca31fae679bb", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1315,7 +1207,7 @@ }, { "type": "NarrativeText", - "element_id": "07ed21008ec3f8801f7cbb1fc670d4db", + "element_id": "f20b489f0bdd9d6e52a9260366e15809", "metadata": { "data_source": { "url": 
"s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1333,7 +1225,7 @@ }, { "type": "NarrativeText", - "element_id": "ba80f89ec0449fefee24b33fbb7e29b6", + "element_id": "cac91db6239b98e81af3b7d4657aba8e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1351,7 +1243,7 @@ }, { "type": "NarrativeText", - "element_id": "9e9ed8938e271667a9512898d2ca629b", + "element_id": "9480b681b55abea19f25da2d11c5a05f", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1369,7 +1261,7 @@ }, { "type": "Image", - "element_id": "b1139c67550215b3f94886c9b2dc1ab5", + "element_id": "73ffa3745f99b6332d0ddfac674755c6", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1383,11 +1275,11 @@ "filetype": "application/pdf", "page_number": 7 }, - "text": "Social andenvironmental costs ofemissions, land-use,climate change, securityof supply, etc. Plant-levelproduction costsat market prices Grid-level costsof the electricitysystem" + "text": "Social and environmental costs of emissions, land-use, climate change, security of supply, etc. 
Plant-level production costs at market prices Grid-level costs of the electricity system " }, { "type": "FigureCaption", - "element_id": "567f470fb4fb5c58b115fbe79a425970", + "element_id": "e36be3332e741c097797f63a8fd3707d", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1405,7 +1297,7 @@ }, { "type": "NarrativeText", - "element_id": "6595e50969f899bd2fa05c0d7a8a682c", + "element_id": "af07e68b26fd9db43d499e55ca9e018f", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1423,7 +1315,7 @@ }, { "type": "NarrativeText", - "element_id": "07958b72a8f6127e362d9ce84be7ea54", + "element_id": "256d31a451bb2c5a0d86a9e80a3ecacd", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1441,7 +1333,7 @@ }, { "type": "Header", - "element_id": "ef2d127de37b942baad06145e54b0c61", + "element_id": "aec400e3e65dc09b31513694bc9893b9", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1459,7 +1351,7 @@ }, { "type": "Header", - "element_id": "e7f6c011776e8db7cd330b54174fd76f", + "element_id": "29399af043bbf069ecfd1abdcaee4b15", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1477,7 +1369,7 @@ }, { "type": "Title", - "element_id": "75ed57ac08703850c3e6aa55ac4aea97", + "element_id": "091660349a1a512762bb9380e9d14cf5", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1495,7 +1387,7 @@ }, { "type": "NarrativeText", - "element_id": "7cb6cd150bb2cc2a0f10ba8584c285c7", + "element_id": "b864ba4245ef997f44c68d9d57c1fe29", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1513,7 +1405,7 @@ }, { "type": "NarrativeText", - "element_id": 
"5165336fa7f2d57e7fa5030f6b4f6a24", + "element_id": "f18dedc70c4506433490779a01898ae0", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1531,7 +1423,7 @@ }, { "type": "FigureCaption", - "element_id": "29215d2c137a392941315c6c7a67e8fd", + "element_id": "49cf9714f5fae5188caca69778912b35", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1549,7 +1441,7 @@ }, { "type": "NarrativeText", - "element_id": "d754d8d468346f652657279272a11897", + "element_id": "6e611a47ebc4cf4fafb25fa9fd407396", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1567,7 +1459,7 @@ }, { "type": "NarrativeText", - "element_id": "0714f9ff88637006bdb76908c7c936bf", + "element_id": "5b6b71051007872635d4f529e167f04e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1585,7 +1477,7 @@ }, { "type": "NarrativeText", - "element_id": "f62c49fcf0a7960d0b509e37507d76d3", + "element_id": "676bbd12ba352a9d057d15cb8c925e36", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1603,7 +1495,7 @@ }, { "type": "Footer", - "element_id": "7902699be42c8a8e46fbbb4501726517", + "element_id": "c69b33366ea0bcfb6c30799a4100c6a0", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1621,7 +1513,7 @@ }, { "type": "Header", - "element_id": "2c624232cdd221771294dfbb310aca00", + "element_id": "671a02117af87c2371462f800d856f15", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1639,7 +1531,7 @@ }, { "type": "Title", - "element_id": "e56261e0bd30965b8e68ed2abb15b141", + "element_id": "a0d7deccf89e42d02a9d66b0c1889689", "metadata": { "data_source": { "url": 
"s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1657,7 +1549,7 @@ }, { "type": "ListItem", - "element_id": "c06ac75f019ceac1ff2baecfc090fd3e", + "element_id": "5389164ad375b2831f97d751f5bdb4e6", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1693,7 +1585,7 @@ }, { "type": "ListItem", - "element_id": "199440a0821e16b612f4697aa2306cb2", + "element_id": "f60b349ce3d55412b0f4e4a6b658c5ca", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1729,7 +1621,7 @@ }, { "type": "ListItem", - "element_id": "18b2cdcbf43cbcab942c6ffa69abdc51", + "element_id": "c5abfd8856e08e4a6d884b3e2bf860f7", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1747,7 +1639,7 @@ }, { "type": "ListItem", - "element_id": "0d47ae52e5f061cfc5048ddcaba403d4", + "element_id": "72e939a5bd164f925c3d0aeb3d9a6af8", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1761,11 +1653,11 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "iv United Nations Scientific Committee on the Effects of Radiation (2016). Report of the United Nations Scientific Committee on the Effects of Atomic Radiation. Accessed from: https://www.unscear.org/docs/publications/2016/UNSCEAR_2016_GA-Report-CORR.pdf" + "text": "iv United Nations Scientific Committee on the Effects of Radiation (2016). Report of the United Nations Scientific Committee on the Effects of Atomic Radiation. 
Accessed from: https://www.unscear.org/docs/publications/2016/ UNSCEAR_2016_GA-Report-CORR.pdf" }, { "type": "ListItem", - "element_id": "81be06e67a1b533cb1278b15860c51db", + "element_id": "add0a2858326fca8ba9f6f4e6bbbdb28", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1819,7 +1711,7 @@ }, { "type": "ListItem", - "element_id": "69bd2cd5a46ac8850a9e3ea2df80de60", + "element_id": "3750db60e990408fd944ea48886461b2", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1837,7 +1729,7 @@ }, { "type": "ListItem", - "element_id": "15e80c04027ef832c3b1390cc65e4bd3", + "element_id": "933612b1c8ac73db347d57c4c1006b30", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1851,11 +1743,11 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "vii World Health Organization. (2016). Updated tables 2016 for ‘Preventing disease through health environments: a global assessment of the burden of disease from environmental risks’. Available at: https://www.who.int/data/gho/data/themes/public-health-and-environment [Accessed on 8 April 2021]" + "text": "vii World Health Organization. (2016). Updated tables 2016 for ‘Preventing disease through health environments: a global assessment of the burden of disease from environmental risks’. 
Available at: https://www.who.int/data/gho/ data/themes/public-health-and-environment [Accessed on 8 April 2021]" }, { "type": "ListItem", - "element_id": "cfe3779da861867bff1504ddefb25de7", + "element_id": "a0acb863356551c514d2a230c16499bc", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1873,7 +1765,7 @@ }, { "type": "ListItem", - "element_id": "dd9a5a9cddd215a320cef8faba067a29", + "element_id": "bbea23b1fb74c1188b13acbf2b55c077", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1891,7 +1783,7 @@ }, { "type": "ListItem", - "element_id": "406c6ad54b798573c5e610cb96d3d7e1", + "element_id": "c0ff9c6926628d5d8e0318ebdf439444", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1909,7 +1801,7 @@ }, { "type": "ListItem", - "element_id": "5f515ae66188ea42830eaf540f4f0c12", + "element_id": "4fd3863356260022a18a3c6932cbf1bf", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1927,7 +1819,7 @@ }, { "type": "ListItem", - "element_id": "af64bcc9f6d36d2c339a592dc2ae75ff", + "element_id": "1646468e15668c053f076342c361e1a9", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1945,7 +1837,7 @@ }, { "type": "NarrativeText", - "element_id": "10407d498f2636f50597e71d97cc001a", + "element_id": "d36f85976820f1619b8be0e2e65d011d", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1963,7 +1855,7 @@ }, { "type": "NarrativeText", - "element_id": "821daa4396c0087d9d5ee9240bc5c85c", + "element_id": "8ff63a0f4af4de37eff90952d575f76d", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1981,7 +1873,7 @@ }, { "type": 
"NarrativeText", - "element_id": "c48603fd38d3449d3afcd2dc18903083", + "element_id": "06d4880e4a23a9520618a50bfbbdf940", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1999,7 +1891,7 @@ }, { "type": "NarrativeText", - "element_id": "705da4db5e220010ddfd03d9452855e4", + "element_id": "4ac69378210b56e0c98be2d41e374769", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -2017,7 +1909,7 @@ }, { "type": "NarrativeText", - "element_id": "fc5faebaec5a1349ce932f1863bdd842", + "element_id": "e0366cbb8cc558e39e0ef80e08f603c1", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index 8e2090a532..0b786b2b93 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -7,7 +7,7 @@ import os import re import zipfile -from typing import IO, Any, Callable, Dict, List, Optional +from typing import IO, Any, Callable, Dict, List, Optional, cast from typing_extensions import ParamSpec @@ -562,7 +562,9 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]: metadata_kwargs = { kwarg: params.get(kwarg) for kwarg in ("filename", "url", "text_as_html") } - elements = set_element_hierarchy(elements) + if not cast(str, kwargs.get("model_name", "")).startswith("chipper"): + # NOTE(alan): Skip hierarchy if using chipper, as it should take care of that + elements = set_element_hierarchy(elements) for element in elements: # NOTE(robinson) - Attached files have already run through this logic diff --git a/unstructured/partition/common.py b/unstructured/partition/common.py index 738a293c39..8d41854ef2 100644 --- a/unstructured/partition/common.py +++ b/unstructured/partition/common.py @@ -36,9 +36,10 @@ from unstructured.logger import logger from unstructured.nlp.patterns 
import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE from unstructured.partition.utils.constants import ( + SORT_MODE_DONT, SORT_MODE_XY_CUT, ) -from unstructured.utils import dependency_exists +from unstructured.utils import dependency_exists, first if dependency_exists("docx") and dependency_exists("docx.table"): from docx.table import Table as docxtable @@ -51,10 +52,8 @@ if TYPE_CHECKING: from unstructured_inference.inference.layout import DocumentLayout, PageLayout - from unstructured_inference.inference.layoutelement import ( - LayoutElement, - LocationlessLayoutElement, - ) + from unstructured_inference.inference.layoutelement import LayoutElement + HIERARCHY_RULE_SET = { "Title": [ @@ -105,7 +104,6 @@ def get_last_modified_date_from_file( def normalize_layout_element( layout_element: Union[ "LayoutElement", - "LocationlessLayoutElement", Element, Dict[str, Any], ], @@ -239,6 +237,8 @@ def set_element_hierarchy( """ stack: List[Element] = [] for element in elements: + if element.metadata.parent_id is not None: + continue parent_id = None element_category = getattr(element, "category", None) element_category_depth = getattr(element.metadata, "category_depth", 0) or 0 @@ -566,8 +566,9 @@ def document_to_element_list( image_width = page_image_metadata.get("width") image_height = page_image_metadata.get("height") + translation_mapping: List[Tuple["LayoutElement", Element]] = [] for layout_element in page.elements: - if image_width and image_height and hasattr(layout_element, "coordinates"): + if image_width and image_height and hasattr(layout_element.bbox, "coordinates"): coordinate_system = PixelSpace(width=image_width, height=image_height) else: coordinate_system = None @@ -584,6 +585,7 @@ def document_to_element_list( el.metadata.last_modified = last_modification_date el.metadata.page_number = i + 1 page_elements.extend(element) + translation_mapping.extend([(layout_element, el) for el in element]) continue else: if last_modification_date: @@ -600,6 +602,7 @@ def 
document_to_element_list( logger.info("HTML element instance has no attribute type") page_elements.append(element) + translation_mapping.append((layout_element, element)) coordinates = ( element.metadata.coordinates.points if element.metadata.coordinates else None ) @@ -620,8 +623,14 @@ def document_to_element_list( **kwargs, ) + for layout_element, element in translation_mapping: + if hasattr(layout_element, "parent") and layout_element.parent is not None: + element_parent = first( + (el for l_el, el in translation_mapping if l_el is layout_element.parent), + ) + element.metadata.parent_id = element_parent.id sorted_page_elements = page_elements - if sortable and sort_mode == SORT_MODE_XY_CUT: + if sortable and sort_mode != SORT_MODE_DONT: sorted_page_elements = sort_page_elements(page_elements, sort_mode) if include_page_breaks and i < num_pages - 1: diff --git a/unstructured/partition/ocr.py b/unstructured/partition/ocr.py index 4208e76d3d..b2e1b0c00e 100644 --- a/unstructured/partition/ocr.py +++ b/unstructured/partition/ocr.py @@ -11,14 +11,11 @@ # unstructured.documents.elements.Image from PIL import Image as PILImage from PIL import ImageSequence -from unstructured_inference.inference.elements import ( - Rectangle, - TextRegion, - partition_groups_from_regions, -) +from unstructured_inference.inference.elements import TextRegion from unstructured_inference.inference.layout import DocumentLayout, PageLayout from unstructured_inference.inference.layoutelement import ( LayoutElement, + partition_groups_from_regions, ) from unstructured_pytesseract import Output @@ -192,7 +189,12 @@ def supplement_page_layout_with_ocr( if element.text == "": padded_element = pad_element_bboxes(element, padding=12) cropped_image = image.crop( - (padded_element.x1, padded_element.y1, padded_element.x2, padded_element.y2), + ( + padded_element.bbox.x1, + padded_element.bbox.y1, + padded_element.bbox.x2, + padded_element.bbox.y2, + ), ) text_from_ocr = get_ocr_text_from_image( 
cropped_image, @@ -216,10 +218,10 @@ def pad_element_bboxes( boxes of the element by extending the boundary outward (resp. inward)""" out_element = deepcopy(element) - out_element.x1 -= padding - out_element.x2 += padding - out_element.y1 -= padding - out_element.y2 += padding + out_element.bbox.x1 -= padding + out_element.bbox.x2 += padding + out_element.bbox.y1 -= padding + out_element.bbox.y2 += padding return out_element @@ -314,7 +316,7 @@ def parse_ocr_data_tesseract(ocr_data: dict) -> List[TextRegion]: (x1, y1, x2, y2) = l, t, l + w, t + h text = ocr_data["text"][i] if text: - text_region = TextRegion(x1, y1, x2, y2, text=text, source="OCR-tesseract") + text_region = TextRegion.from_coords(x1, y1, x2, y2, text=text, source="OCR-tesseract") text_regions.append(text_region) return text_regions @@ -350,7 +352,7 @@ def parse_ocr_data_paddle(ocr_data: list) -> List[TextRegion]: y2 = max([i[1] for i in line[0]]) text = line[1][0] if text: - text_region = TextRegion(x1, y1, x2, y2, text, source="OCR-paddle") + text_region = TextRegion.from_coords(x1, y1, x2, y2, text, source="OCR-paddle") text_regions.append(text_region) return text_regions @@ -399,8 +401,8 @@ def aggregate_ocr_text_by_block( extracted_texts = [] for ocr_region in ocr_layout: - ocr_region_is_subregion_of_given_region = ocr_region.is_almost_subregion_of( - region, + ocr_region_is_subregion_of_given_region = ocr_region.bbox.is_almost_subregion_of( + region.bbox, subregion_threshold=subregion_threshold, ) if ocr_region_is_subregion_of_given_region and ocr_region.text: @@ -442,8 +444,8 @@ def supplement_layout_with_ocr_elements( ocr_regions_to_remove = [] for ocr_region in ocr_layout: for el in layout: - ocr_region_is_subregion_of_out_el = ocr_region.is_almost_subregion_of( - cast(Rectangle, el), + ocr_region_is_subregion_of_out_el = ocr_region.bbox.is_almost_subregion_of( + el.bbox, SUBREGION_THRESHOLD_FOR_OCR, ) if ocr_region_is_subregion_of_out_el: @@ -471,15 +473,7 @@ def 
get_elements_from_ocr_regions(ocr_regions: List[TextRegion]) -> List[LayoutE ) merged_regions = [merge_text_regions(group) for group in grouped_regions] return [ - LayoutElement( - r.x1, - r.y1, - r.x2, - r.y2, - text=r.text, - source=r.source, - type="UncategorizedText", - ) + LayoutElement(text=r.text, source=r.source, type="UncategorizedText", bbox=r.bbox) for r in merged_regions ] @@ -495,11 +489,11 @@ def merge_text_regions(regions: List[TextRegion]) -> TextRegion: - TextRegion: A single merged TextRegion object. """ - min_x1 = min([tr.x1 for tr in regions]) - min_y1 = min([tr.y1 for tr in regions]) - max_x2 = max([tr.x2 for tr in regions]) - max_y2 = max([tr.y2 for tr in regions]) + min_x1 = min([tr.bbox.x1 for tr in regions]) + min_y1 = min([tr.bbox.y1 for tr in regions]) + max_x2 = max([tr.bbox.x2 for tr in regions]) + max_y2 = max([tr.bbox.y2 for tr in regions]) merged_text = " ".join([tr.text for tr in regions if tr.text]) - return TextRegion(min_x1, min_y1, max_x2, max_y2, merged_text) + return TextRegion.from_coords(min_x1, min_y1, max_x2, max_y2, merged_text) diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 058c21e0a4..1d68edd0a3 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -60,7 +60,12 @@ ) from unstructured.partition.strategies import determine_pdf_or_image_strategy from unstructured.partition.text import element_from_text, partition_text -from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_XY_CUT, OCRMode +from unstructured.partition.utils.constants import ( + SORT_MODE_BASIC, + SORT_MODE_DONT, + SORT_MODE_XY_CUT, + OCRMode, +) from unstructured.partition.utils.sorting import ( coord_has_valid_points, sort_page_elements, @@ -377,14 +382,18 @@ def _partition_pdf_or_image_local( pdf_image_dpi=pdf_image_dpi, **process_with_model_kwargs, ) - final_layout = process_file_with_ocr( - filename, - out_layout, - is_image=is_image, - ocr_languages=ocr_languages, - 
ocr_mode=ocr_mode, - pdf_image_dpi=pdf_image_dpi, - ) + if model_name.startswith("chipper"): + # NOTE(alan): We shouldn't do OCR with chipper + final_layout = out_layout + else: + final_layout = process_file_with_ocr( + filename, + out_layout, + is_image=is_image, + ocr_languages=ocr_languages, + ocr_mode=ocr_mode, + pdf_image_dpi=pdf_image_dpi, + ) else: out_layout = process_data_with_model( file, @@ -394,16 +403,24 @@ def _partition_pdf_or_image_local( pdf_image_dpi=pdf_image_dpi, **process_with_model_kwargs, ) - if hasattr(file, "seek"): - file.seek(0) - final_layout = process_data_with_ocr( - file, - out_layout, - is_image=is_image, - ocr_languages=ocr_languages, - ocr_mode=ocr_mode, - pdf_image_dpi=pdf_image_dpi, - ) + if model_name.startswith("chipper"): + # NOTE(alan): We shouldn't do OCR with chipper + final_layout = out_layout + else: + if hasattr(file, "seek"): + file.seek(0) + final_layout = process_data_with_ocr( + file, + out_layout, + is_image=is_image, + ocr_languages=ocr_languages, + ocr_mode=ocr_mode, + pdf_image_dpi=pdf_image_dpi, + ) + + # NOTE(alan): starting with v2, chipper sorts the elements itself. + if model_name == "chipper": + kwargs["sort_mode"] = SORT_MODE_DONT elements = document_to_element_list( final_layout, @@ -438,7 +455,9 @@ def _partition_pdf_or_image_local( " ", el.text or "", ).strip() - if el.text or isinstance(el, PageBreak): + # NOTE(alan): with chipper there are parent elements with no text we don't want to + # filter those out and leave the children orphaned. 
+ if el.text or isinstance(el, PageBreak) or model_name.startswith("chipper"): out_elements.append(cast(Element, el)) return out_elements diff --git a/unstructured/partition/utils/constants.py b/unstructured/partition/utils/constants.py index ceb3ab9605..a95fa21406 100644 --- a/unstructured/partition/utils/constants.py +++ b/unstructured/partition/utils/constants.py @@ -9,6 +9,7 @@ class OCRMode(Enum): SORT_MODE_XY_CUT = "xy-cut" SORT_MODE_BASIC = "basic" +SORT_MODE_DONT = "dont" SUBREGION_THRESHOLD_FOR_OCR = 0.5 UNSTRUCTURED_INCLUDE_DEBUG_METADATA = os.getenv("UNSTRUCTURED_INCLUDE_DEBUG_METADATA", False) diff --git a/unstructured/utils.py b/unstructured/utils.py index d0dc772be6..e57549c2fb 100644 --- a/unstructured/utils.py +++ b/unstructured/utils.py @@ -6,7 +6,20 @@ import subprocess from datetime import datetime from functools import wraps -from typing import Any, Callable, Dict, Generic, List, Optional, TypeVar, Union, cast +from typing import ( + Any, + Callable, + Dict, + Generic, + Iterable, + Iterator, + List, + Optional, + Tuple, + TypeVar, + Union, + cast, +) import requests from typing_extensions import ParamSpec @@ -197,6 +210,35 @@ def validate_date_args(date: Optional[str] = None): ) +def _first_and_remaining_iterator(it: Iterable) -> Tuple[Any, Iterator]: + iterator = iter(it) + try: + out = next(iterator) + except StopIteration: + raise ValueError( + "Expected at least 1 element in iterable from which to retrieve first, got empty " + "iterable.", + ) + return out, iterator + + +def first(it: Iterable) -> Any: + """Returns the first item from an iterable. Raises an error if the iterable is empty.""" + out, _ = _first_and_remaining_iterator(it) + return out + + +def only(it: Iterable) -> Any: + """Returns the only element from a singleton iterable. 
Raises an error if the iterable is not a + singleton.""" + out, iterator = _first_and_remaining_iterator(it) + if any(True for _ in iterator): + raise ValueError( + "Expected only 1 element in passed argument, instead there are at least 2 elements.", + ) + return out + + def scarf_analytics(): try: subprocess.check_output("nvidia-smi")