From ad59a879cc2a9b1074d58d80345a29b541389b6d Mon Sep 17 00:00:00 2001 From: Yao You Date: Fri, 29 Sep 2023 14:09:57 -0500 Subject: [PATCH] chore: bump inference to 0.6.6 (#1563) - bump `unstructured-inference` to `0.6.6` - specify default model name for element detection to be `detectron2_onnx` to keep current behavior - NOTE: the updated inference package by default would use yolox as element detection model; this will be evaluated and enabled in a separated PR --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: badGarnet --- CHANGELOG.md | 11 +- requirements/constraints.in | 7 +- requirements/dev.txt | 12 +- requirements/extra-paddleocr.txt | 7 +- requirements/extra-pdf-image.in | 2 +- requirements/extra-pdf-image.txt | 9 +- requirements/huggingface.txt | 2 +- requirements/ingest-airtable.txt | 2 +- requirements/ingest-azure.txt | 2 +- requirements/ingest-box.txt | 2 +- requirements/ingest-gcs.txt | 3 +- requirements/ingest-github.txt | 2 +- requirements/ingest-google-drive.txt | 3 +- requirements/ingest-notion.txt | 22 +- requirements/ingest-onedrive.txt | 2 +- requirements/ingest-openai.txt | 17 +- requirements/ingest-outlook.txt | 2 +- requirements/ingest-salesforce.txt | 2 +- requirements/ingest-sharepoint.txt | 2 +- requirements/test.txt | 4 +- .../create-and-check-es.sh | 2 +- .../partition/pdf-image/test_image.py | 2 +- .../partition/pdf-image/test_pdf.py | 4 +- test_unstructured/partition/test_auto.py | 2 +- ...iomedical-Data-Scientists-2-pages.pdf.json | 1298 +---------------- .../azure/IRS-form-1987.png.json | 19 - .../biomed-api/65/11/main.PMC6312790.pdf.json | 208 +-- .../biomed-api/75/29/main.PMC6312793.pdf.json | 462 +----- .../layout-parser-paper.pdf.json | 584 +------- .../2023-Jan-economic-outlook.pdf.json | 758 ++-------- .../small-pdf-set/Silent-Giant-(1).pdf.json | 270 +--- .../recalibrating-risk-report.pdf.json | 410 +----- unstructured/__version__.py | 2 +- unstructured/partition/pdf.py | 6 +- 34 files changed, 333 insertions(+), 3809 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f0fd3dd6c3..264cf9f44e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,13 @@ -## 0.10.19-dev0 +## 0.10.19-dev1 + +### Enhancements + +* **bump `unstructured-inference` to `0.6.6`** The updated version of `unstructured-inference` makes table extraction in `hi_res` mode configurable to fine tune table extraction performance; it also improves element detection by adding a deduplication post processing step in the `hi_res` partitioning of pdfs and images. + +### Features + +### Fixes + ## 0.10.18 diff --git a/requirements/constraints.in b/requirements/constraints.in index 59f1d35dc2..19a6775177 100644 --- a/requirements/constraints.in +++ b/requirements/constraints.in @@ -39,5 +39,8 @@ matplotlib==3.7.2 # NOTE(crag) - pin to available pandas for python 3.8 (at least in CI) fsspec==2023.9.1 pandas<2.0.4 -# langchain limits this to 3.1.7 -anyio==3.1.7 +# langchain limits anyio to below 4.0 +anyio<4.0 +# pinned in unstructured paddleocr +opencv-python==4.8.0.76 +opencv-contrib-python==4.8.0.76 diff --git a/requirements/dev.txt b/requirements/dev.txt index f785ea00bd..b90b4776d6 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -4,8 +4,10 @@ # # pip-compile requirements/dev.in # -anyio==4.0.0 - # via jupyter-server +anyio==3.7.1 + # via + # -c requirements/constraints.in + # jupyter-server appnope==0.1.3 # via # ipykernel @@ -42,7 +44,7 @@ certifi==2023.7.22 # -c requirements/constraints.in # -c requirements/test.txt # requests -cffi==1.15.1 +cffi==1.16.0 # via argon2-cffi-bindings cfgv==3.4.0 # via pre-commit @@ -151,7 +153,7 @@ jupyter-client==8.3.1 # qtconsole jupyter-console==6.6.3 # via jupyter -jupyter-core==5.3.1 +jupyter-core==5.3.2 # via # -c requirements/constraints.in # ipykernel @@ -393,7 +395,7 @@ urllib3==1.26.16 # requests virtualenv==20.24.5 # via pre-commit -wcwidth==0.2.6 +wcwidth==0.2.7 # via prompt-toolkit webcolors==1.13 # via jsonschema diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index ada01fb2a6..1f028530d2 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -33,7 +33,7 @@ cssselect==1.2.0 # via premailer cssutils==2.7.1 # via premailer -cycler==0.11.0 +cycler==0.12.0 # via matplotlib cython==3.0.2 # via unstructured-paddleocr @@ -112,9 +112,12 @@ numpy==1.24.4 # unstructured-paddleocr # visualdl opencv-contrib-python==4.8.0.76 - # via unstructured-paddleocr + # via + # -c requirements/constraints.in + # unstructured-paddleocr opencv-python==4.8.0.76 # via + # -c requirements/constraints.in # imgaug # unstructured-paddleocr openpyxl==3.1.2 diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in index f22311f875..fa9cbcda5a 100644 --- a/requirements/extra-pdf-image.in +++ b/requirements/extra-pdf-image.in @@ -5,7 +5,7 @@ pdf2image pdfminer.six # Do not move to contsraints.in, otherwise unstructured-inference will not be upgraded # when unstructured library is. -unstructured-inference==0.5.31 +unstructured-inference==0.6.6 # unstructured fork of pytesseract that provides an interface to allow for multiple output formats # from one tesseract call unstructured.pytesseract>=0.3.12 diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index aebc5d8dee..679ffef54c 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -11,7 +11,7 @@ certifi==2023.7.22 # -c requirements/base.txt # -c requirements/constraints.in # requests -cffi==1.15.1 +cffi==1.16.0 # via cryptography charset-normalizer==3.2.0 # via @@ -24,7 +24,7 @@ contourpy==1.1.1 # via matplotlib cryptography==41.0.4 # via pdfminer-six -cycler==0.11.0 +cycler==0.12.0 # via matplotlib effdet==0.4.1 # via layoutparser @@ -95,6 +95,7 @@ onnxruntime==1.16.0 # via unstructured-inference opencv-python==4.8.0.76 # via + # -c requirements/constraints.in # layoutparser # unstructured-inference packaging==23.1 @@ -213,7 +214,7 @@ tqdm==4.66.1 # huggingface-hub # iopath # transformers -transformers==4.33.2 +transformers==4.33.3 # via unstructured-inference typing-extensions==4.8.0 # via @@ -224,7 +225,7 @@ typing-extensions==4.8.0 # torch tzdata==2023.3 # via pandas -unstructured-inference==0.5.31 +unstructured-inference==0.6.6 # via -r requirements/extra-pdf-image.in unstructured-pytesseract==0.3.12 # via diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index 03fdd36ce2..00ba71293a 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -97,7 +97,7 @@ tqdm==4.66.1 # huggingface-hub # sacremoses # transformers -transformers==4.33.2 +transformers==4.33.3 # via -r requirements/huggingface.in typing-extensions==4.8.0 # via diff --git a/requirements/ingest-airtable.txt b/requirements/ingest-airtable.txt index 1b535a5db6..db7e92a6a1 100644 --- a/requirements/ingest-airtable.txt +++ b/requirements/ingest-airtable.txt @@ -21,7 +21,7 @@ inflection==0.5.1 # via pyairtable pyairtable==2.1.0.post1 # via -r requirements/ingest-airtable.in -pydantic==1.10.12 +pydantic==1.10.13 # via # -c requirements/constraints.in # pyairtable diff --git a/requirements/ingest-azure.txt b/requirements/ingest-azure.txt index e9eadb8deb..e682d29422 100644 --- a/requirements/ingest-azure.txt +++ b/requirements/ingest-azure.txt @@ -30,7 +30,7 @@ certifi==2023.7.22 # -c requirements/base.txt # -c requirements/constraints.in # requests -cffi==1.15.1 +cffi==1.16.0 # via # azure-datalake-store # cryptography diff --git a/requirements/ingest-box.txt b/requirements/ingest-box.txt index bc022a226c..79268b6b3d 100644 --- a/requirements/ingest-box.txt +++ b/requirements/ingest-box.txt @@ -15,7 +15,7 @@ certifi==2023.7.22 # -c requirements/base.txt # -c requirements/constraints.in # requests -cffi==1.15.1 +cffi==1.16.0 # via cryptography charset-normalizer==3.2.0 # via diff --git a/requirements/ingest-gcs.txt b/requirements/ingest-gcs.txt index 5533294fbc..4f6d048137 100644 --- a/requirements/ingest-gcs.txt +++ b/requirements/ingest-gcs.txt @@ -47,7 +47,7 @@ google-api-core==2.12.0 # via # google-cloud-core # google-cloud-storage -google-auth==2.23.0 +google-auth==2.23.2 # via # gcsfs # google-api-core @@ -107,7 +107,6 @@ urllib3==1.26.16 # via # -c requirements/base.txt # -c requirements/constraints.in - # google-auth # requests yarl==1.9.2 # via aiohttp diff --git a/requirements/ingest-github.txt b/requirements/ingest-github.txt index 865778e014..ad5ac2a7a0 100644 --- a/requirements/ingest-github.txt +++ b/requirements/ingest-github.txt @@ -9,7 +9,7 @@ certifi==2023.7.22 # -c requirements/base.txt # -c requirements/constraints.in # requests -cffi==1.15.1 +cffi==1.16.0 # via # cryptography # pynacl diff --git a/requirements/ingest-google-drive.txt b/requirements/ingest-google-drive.txt index 993a01d200..9f90bcc9ca 100644 --- a/requirements/ingest-google-drive.txt +++ b/requirements/ingest-google-drive.txt @@ -19,7 +19,7 @@ google-api-core==2.12.0 # via google-api-python-client google-api-python-client==2.101.0 # via -r requirements/ingest-google-drive.in -google-auth==2.23.0 +google-auth==2.23.2 # via # google-api-core # google-api-python-client @@ -63,5 +63,4 @@ urllib3==1.26.16 # via # -c requirements/base.txt # -c requirements/constraints.in - # google-auth # requests diff --git a/requirements/ingest-notion.txt b/requirements/ingest-notion.txt index fadccceea2..b200c2562e 100644 --- a/requirements/ingest-notion.txt +++ b/requirements/ingest-notion.txt @@ -4,33 +4,35 @@ # # pip-compile requirements/ingest-notion.in # -certifi==2023.7.22 +anyio==3.7.1 # via - # -c requirements/base.txt # -c requirements/constraints.in - # httpx -charset-normalizer==3.2.0 + # httpcore +certifi==2023.7.22 # via # -c requirements/base.txt + # -c requirements/constraints.in + # httpcore # httpx -h11==0.12.0 +exceptiongroup==1.1.3 + # via anyio +h11==0.14.0 # via httpcore htmlbuilder==1.0.0 # via -r requirements/ingest-notion.in -httpcore==0.13.3 +httpcore==0.18.0 # via httpx -httpx==0.20.0 +httpx==0.25.0 # via notion-client idna==3.4 # via # -c requirements/base.txt + # anyio # httpx - # rfc3986 notion-client==2.0.0 # via -r requirements/ingest-notion.in -rfc3986[idna2008]==1.5.0 - # via httpx sniffio==1.3.0 # via + # anyio # httpcore # httpx diff --git a/requirements/ingest-onedrive.txt b/requirements/ingest-onedrive.txt index cb5c5903cb..2d9627f1d4 100644 --- a/requirements/ingest-onedrive.txt +++ b/requirements/ingest-onedrive.txt @@ -15,7 +15,7 @@ certifi==2023.7.22 # -c requirements/base.txt # -c requirements/constraints.in # requests -cffi==1.15.1 +cffi==1.16.0 # via cryptography charset-normalizer==3.2.0 # via diff --git a/requirements/ingest-openai.txt b/requirements/ingest-openai.txt index a2854493a0..d7846c0a08 100644 --- a/requirements/ingest-openai.txt +++ b/requirements/ingest-openai.txt @@ -10,6 +10,10 @@ aiohttp==3.8.5 # openai aiosignal==1.3.1 # via aiohttp +anyio==3.7.1 + # via + # -c requirements/constraints.in + # langchain async-timeout==4.0.3 # via # aiohttp @@ -30,6 +34,8 @@ dataclasses-json==0.6.1 # via # -c requirements/base.txt # langchain +exceptiongroup==1.1.3 + # via anyio frozenlist==1.4.0 # via # aiohttp @@ -37,9 +43,14 @@ frozenlist==1.4.0 idna==3.4 # via # -c requirements/base.txt + # anyio # requests # yarl -langchain==0.0.298 +jsonpatch==1.33 + # via langchain +jsonpointer==2.4 + # via jsonpatch +langchain==0.0.304 # via -r requirements/ingest-openai.in langsmith==0.0.41 # via langchain @@ -69,7 +80,7 @@ packaging==23.1 # via # -c requirements/base.txt # marshmallow -pydantic==1.10.12 +pydantic==1.10.13 # via # -c requirements/constraints.in # langchain @@ -87,6 +98,8 @@ requests==2.31.0 # langsmith # openai # tiktoken +sniffio==1.3.0 + # via anyio sqlalchemy==2.0.21 # via langchain tenacity==8.2.3 diff --git a/requirements/ingest-outlook.txt b/requirements/ingest-outlook.txt index 508d7573dd..ccef36d349 100644 --- a/requirements/ingest-outlook.txt +++ b/requirements/ingest-outlook.txt @@ -9,7 +9,7 @@ certifi==2023.7.22 # -c requirements/base.txt # -c requirements/constraints.in # requests -cffi==1.15.1 +cffi==1.16.0 # via cryptography charset-normalizer==3.2.0 # via diff --git a/requirements/ingest-salesforce.txt b/requirements/ingest-salesforce.txt index 116a00eab0..a6c31b1014 100644 --- a/requirements/ingest-salesforce.txt +++ b/requirements/ingest-salesforce.txt @@ -11,7 +11,7 @@ certifi==2023.7.22 # -c requirements/base.txt # -c requirements/constraints.in # requests -cffi==1.15.1 +cffi==1.16.0 # via cryptography charset-normalizer==3.2.0 # via diff --git a/requirements/ingest-sharepoint.txt b/requirements/ingest-sharepoint.txt index 7a209b1042..99d1efbfde 100644 --- a/requirements/ingest-sharepoint.txt +++ b/requirements/ingest-sharepoint.txt @@ -9,7 +9,7 @@ certifi==2023.7.22 # -c requirements/base.txt # -c requirements/constraints.in # requests -cffi==1.15.1 +cffi==1.16.0 # via cryptography charset-normalizer==3.2.0 # via diff --git a/requirements/test.txt b/requirements/test.txt index fe8dc02504..98d40fd188 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -74,7 +74,7 @@ pluggy==1.3.0 # via pytest pycodestyle==2.11.0 # via flake8 -pydantic==1.10.12 +pydantic==1.10.13 # via # -c requirements/constraints.in # -r requirements/test.in @@ -113,7 +113,7 @@ types-click==7.1.8 # via -r requirements/test.in types-markdown==3.4.2.10 # via -r requirements/test.in -types-requests==2.31.0.5 +types-requests==2.31.0.6 # via -r requirements/test.in types-tabulate==0.9.0.3 # via -r requirements/test.in diff --git a/scripts/elasticsearch-test-helpers/create-and-check-es.sh b/scripts/elasticsearch-test-helpers/create-and-check-es.sh index 62f7cb6b66..44fca2f7d3 100755 --- a/scripts/elasticsearch-test-helpers/create-and-check-es.sh +++ b/scripts/elasticsearch-test-helpers/create-and-check-es.sh @@ -9,7 +9,7 @@ docker run -d --rm -p 9200:9200 -p 9300:9300 -e "xpack.security.enabled=false" - echo "Waiting for Elasticsearch container to start..." sleep 1 -url="http://localhost:9200/_cluster/health" +url="http://localhost:9200/_cluster/health?wait_for_status=green&timeout=50s" status_code=0 retry_count=0 max_retries=6 diff --git a/test_unstructured/partition/pdf-image/test_image.py b/test_unstructured/partition/pdf-image/test_image.py index ce026767a1..e2c9496356 100644 --- a/test_unstructured/partition/pdf-image/test_image.py +++ b/test_unstructured/partition/pdf-image/test_image.py @@ -440,7 +440,7 @@ def test_partition_image_formats_languages_for_tesseract(): ocr_languages="jpn_vert", ocr_mode="entire_page", extract_tables=False, - model_name=None, + model_name="detectron2_onnx", ) diff --git a/test_unstructured/partition/pdf-image/test_pdf.py b/test_unstructured/partition/pdf-image/test_pdf.py index d5dfcb8189..6cf4e93894 100644 --- a/test_unstructured/partition/pdf-image/test_pdf.py +++ b/test_unstructured/partition/pdf-image/test_pdf.py @@ -407,7 +407,7 @@ def test_partition_pdf_with_dpi(): ocr_languages="eng", ocr_mode="entire_page", extract_tables=False, - model_name=None, + model_name="detectron2_onnx", pdf_image_dpi=100, ) @@ -858,7 +858,7 @@ def test_partition_pdf_formats_languages_for_tesseract(): ocr_languages="eng", ocr_mode="entire_page", extract_tables=False, - model_name=None, + model_name="detectron2_onnx", ) diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index f3e91e6a6b..dcacf01ba2 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -381,7 +381,7 @@ def test_auto_partition_formats_languages_for_tesseract(): ocr_languages="chi_sim+chi_sim_vert+chi_tra+chi_tra_vert", ocr_mode="entire_page", extract_tables=False, - model_name=None, + model_name="detectron2_onnx", ) diff --git a/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json b/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json index 7cbf4decf9..b37185fd27 100644 --- a/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json @@ -1,7 +1,7 @@ [ { "type": "Title", - "element_id": "cf66bb0e9e68e3a82a99b5621e4394f8", + "element_id": "0b8804afbc4722108e877480e28462a6", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -16,30 +16,11 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "Core Skills for Biomedical Data" + "text": "Core Skills for Biomedical Data Scientists" }, { - "type": "Title", - "element_id": "733383a5f0f5bdea71d6d48805365e6f", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "Scientists" - }, - { - "type": "Title", - "element_id": "64b2134f054446d473fce1b05d4d4c94", + "type": "NarrativeText", + "element_id": "46b1e4dae5ffd7cdcb2a6ed9f206a8ee", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -77,159 +58,7 @@ }, { "type": "Title", - "element_id": "f089eaef57aba315bc0e1455985c0c8e", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "Michael" - }, - { - "type": "UncategorizedText", - "element_id": "fd0a559e715a134218c73276dc57d463", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "F." - }, - { - "type": "UncategorizedText", - "element_id": "44be44eccd482217c097571ddfa61f49", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "Huerta," - }, - { - "type": "Title", - "element_id": "394df19f0626f36d12da449624b691f9", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "PhD, Associate" - }, - { - "type": "Title", - "element_id": "4f5a6389c571b0d01690b1db0349c1b4", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "Director of" - }, - { - "type": "Title", - "element_id": "aecfc6e5b6c0de37a2c06c2fb1d71c82", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "NLM" - }, - { - "type": "Title", - "element_id": "237622d8c80fbdbe790b92d500aa7b00", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "for Program Development and" - }, - { - "type": "Title", - "element_id": "aecfc6e5b6c0de37a2c06c2fb1d71c82", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "NLM" - }, - { - "type": "Title", - "element_id": "ba490653e1ad81f341c35ae470c1b825", + "element_id": "d9644fb4b85468d186b132c91ca64f31", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -244,7 +73,7 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "Coordinator of Data Science and Open Science Initiatives" + "text": "Michael F. Huerta, PhD, Associate Director of NLM for Program Development and NLM Coordinator of Data Science and Open Science Initiatives" }, { "type": "Title", @@ -265,28 +94,9 @@ }, "text": "Executive Summary" }, - { - "type": "Title", - "element_id": "6712d87f1d156abf6171f700e2875889", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "biomedical" - }, { "type": "NarrativeText", - "element_id": "2364a6d2f9a3858d51d91b817732e6c9", + "element_id": "d6df9cd66da09d30c16d194e877766ca", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -301,11 +111,11 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "This report provides recommendations for a scientists based on analysis that draws on opinions of data scientists, curricula for existing science requirements science jobs." + "text": "This report provides recommendations for a minimal set of core skills for biomedical data scientists based on analysis that draws on opinions of data scientists, curricula for existing biomedical data science programs, and requirements for biomedical data science jobs. Suggested high-level core skills include:" }, { - "type": "Title", - "element_id": "3a6eb0790f39ac87c94f3856b2dd2c5d", + "type": "ListItem", + "element_id": "d94c6241299e6eff20ee6499cb9f64de", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -320,68 +130,11 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "data" + "text": "1. General biomedical subject matter knowledge: biomedical data scientists should have a general working knowledge of the principles of biology, bioinformatics, and basic clinical science; 2. Programming language expertise: biomedical data scientists should be fluent in at least one programming language (typically R and/or Python); 3. Predictive analytics, modeling, and machine learning: while a range of statistical methods may be useful, predictive analytics, modeling, and machine learning emerged as especially important skills in biomedical data science; 4. Team science and scientific communication: “soft” skills, like the ability to work well on teams and communicate effectively in both verbal and written venues, may be as important as the more technical skills typically associated with data science. 5. Responsible data stewardship: a successful data scientist must be able to implement best practices for data management and stewardship, as well as conduct research in an ethical manner that maintains data security and privacy." }, { "type": "UncategorizedText", - "element_id": "50e891aa619a7ccbeab043789ca5dd1a", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "programs," - }, - { - "type": "Title", - "element_id": "6201111b83a0cb5b0922cb37cc442b9a", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "and" - }, - { - "type": "Title", - "element_id": "a703788f832056626d71b7db4d805524", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "minimal" - }, - { - "type": "Title", - "element_id": "6ee0eb490ff832101cf82a3d387c35f2", + "element_id": "34b28172088bba51c6764df6d4e87674", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -396,11 +149,11 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "set" + "text": "The report further details specific skills and expertise relevant to biomedical data scientists." }, { "type": "Title", - "element_id": "10c22bcf4c768b515be4e94bcafc71bf", + "element_id": "89b1f4c3df983454e25b233320781610", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -415,11 +168,11 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "for" + "text": "Motivation" }, { - "type": "Title", - "element_id": "28391d3bc64ec15cbb090426b04aa6b7", + "type": "NarrativeText", + "element_id": "cfe4cc76625dc82267d95ec1dc7e7813", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -434,11 +187,11 @@ "filetype": "application/pdf", "page_number": 1 }, - "text": "of" + "text": "Training a biomedical data science (BDS) workforce is a central theme in NLM’s Strategic Plan for the coming decade. That commitment is echoed in the NIH-wide Big Data to Knowledge (BD2K) initiative, which invested $61 million between FY2014 and FY2017 in training programs for the development and use of biomedical big data science methods and tools. In line with" }, { - "type": "Title", - "element_id": "6712d87f1d156abf6171f700e2875889", + "type": "UncategorizedText", + "element_id": "68431de56564c6ad6aa3e6c02b78c89c", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -451,13 +204,13 @@ "date_modified": "2023-03-10T09:32:44+00:00" }, "filetype": "application/pdf", - "page_number": 1 + "page_number": 2 }, - "text": "biomedical" + "text": "Core Skills for Biomedical Data Scientists _____________________________________________________________________________________________" }, { "type": "Title", - "element_id": "0d45f5fd462b8c70bffb10021ac1bcff", + "element_id": "edd5f2f5a60a83c8899e533ac8bcd03c", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -470,13 +223,13 @@ "date_modified": "2023-03-10T09:32:44+00:00" }, "filetype": "application/pdf", - "page_number": 1 + "page_number": 2 }, - "text": "core" + "text": "this commitment, a recent report to the NLM Director recommended working across NIH to identify and develop core skills required of a biomedical data scientist to consistency across the cohort of NIH-trained data scientists. This report provides a set of recommended core skills based on analysis of current BD2K-funded training programs, biomedical data science job ads, and practicing members of the current data science workforce." }, { "type": "Title", - "element_id": "50c5080f67ea1f9eff473e46e6314fd2", + "element_id": "3c36cd10b2e64b9f2169f05abddd4981", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -489,13 +242,13 @@ "date_modified": "2023-03-10T09:32:44+00:00" }, "filetype": "application/pdf", - "page_number": 1 + "page_number": 2 }, - "text": "skills for biomedical" + "text": "Methodology" }, { "type": "Title", - "element_id": "3a6eb0790f39ac87c94f3856b2dd2c5d", + "element_id": "987542acede56f098db655f02fb814a7", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -508,13 +261,13 @@ "date_modified": "2023-03-10T09:32:44+00:00" }, "filetype": "application/pdf", - "page_number": 1 + "page_number": 2 }, - "text": "data" + "text": "The Workforce Excellence team took a three-pronged approach to identifying core skills required of a biomedical data scientist (BDS), drawing from:" }, { - "type": "Title", - "element_id": "3a6eb0790f39ac87c94f3856b2dd2c5d", + "type": "ListItem", + "element_id": "fdd38e2d80cc964e9bf3c7e09a760e21", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -527,982 +280,13 @@ "date_modified": "2023-03-10T09:32:44+00:00" }, "filetype": "application/pdf", - "page_number": 1 + "page_number": 2 }, - "text": "data" + "text": "a) Responses to a 2017 Kaggle' survey’ of over 16,000 self-identified data scientists working across many industries. Analysis of the Kaggle survey responses from the current data science workforce provided insights into the current generation of data scientists, including how they were trained and what programming and analysis skills they use. b) Data science skills taught in BD2K-funded training programs. A qualitative content analysis was applied to the descriptions of required courses offered under the 12 BD2kK-funded training programs. Each course was coded using qualitative data analysis software, with each skill that was present in the description counted once. The coding schema of data science-related skills was inductively developed and was organized into four major categories: (1) statistics and math skills; (2) computer science; (3) subject knowledge; (4) general skills, like communication and teamwork. The coding schema is detailed in Appendix A. c) Desired skills identified from data science-related job ads. 59 job ads from government (8.5%), academia (42.4%), industry (83.9%), and the nonprofit sector (15.3%) were sampled from websites like Glassdoor, Linkedin, and Ziprecruiter. The" }, { "type": "NarrativeText", - "element_id": "18f107bf25f694db07b6aba0a5aaf321", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "Suggested high-level core skills include:" - }, - { - "type": "ListItem", - "element_id": "8f90f5970c85f335b1bf50af611ce5c5", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "1. General biomedical subject matter knowledge: biomedical data scientists should have a general working knowledge of the principles of biology, bioinformatics, and basic clinical science;" - }, - { - "type": "ListItem", - "element_id": "d1a5bb898aee8de0fbdf048c7a9fb01d", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "2. Programming language expertise: biomedical data scientists should be fluent in at" - }, - { - "type": "Title", - "element_id": "18e42d24d6449a9b52fc65fc3f9710b4", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "least one programming language (typically R and/or Python);" - }, - { - "type": "ListItem", - "element_id": "c6be5389b7bd00746d39b7bac468dea0", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "3. Predictive analytics, modeling, and machine learning: while a range of statistical methods may be useful, predictive analytics, modeling, and machine learning emerged as especially important skills in biomedical data science;" - }, - { - "type": "ListItem", - "element_id": "1b8039583cbc15f654c89f2141eb6e10", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "4. Team science and scientific communication: “soft” skills, like the ability to work well on teams and communicate effectively in both verbal and written venues, may be as important as the more technical skills typically associated with data science." - }, - { - "type": "ListItem", - "element_id": "2f87757b1d497a32c077be543632ed7d", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "5. Responsible data stewardship: a successful data scientist must be able to implement best practices for data management and stewardship, as well as conduct research in an ethical manner that maintains data security and privacy." - }, - { - "type": "UncategorizedText", - "element_id": "34b28172088bba51c6764df6d4e87674", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "The report further details specific skills and expertise relevant to biomedical data scientists." - }, - { - "type": "Title", - "element_id": "89b1f4c3df983454e25b233320781610", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "Motivation" - }, - { - "type": "NarrativeText", - "element_id": "3d8fbacaba9067faef48850d43801268", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "Training a biomedical data science (BDS) workforce is a central theme in NLM’s Strategic Plan for the coming decade. That commitment is echoed in the NIH-wide Big Data to Knowledge (BD2k) initiative, which invested $61 million between FY2014 and FY2017 in training programs for the development and use of biomedical big data science methods and tools. In line with" - }, - { - "type": "UncategorizedText", - "element_id": "68431de56564c6ad6aa3e6c02b78c89c", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "Core Skills for Biomedical Data Scientists _____________________________________________________________________________________________" - }, - { - "type": "NarrativeText", - "element_id": "326e7d081e9418423ea62bf3802caaa3", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "this commitment, recent report to the NLM Director recommended working across identify and develop core skills required of a biomedical data scientist to consistency across the cohort of NIH-trained data scientists. This report provides a set of recommended core skills based on analysis of current BD2K-funded training programs, biomedical data science job ads, and practicing members of the current data science workforce." - }, - { - "type": "Title", - "element_id": "ca978112ca1bbdcafac231b39a23dc4d", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "a" - }, - { - "type": "Title", - "element_id": "acc8586a874eb74f10c3f90620f20617", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "NIH to" - }, - { - "type": "Title", - "element_id": "f26d07e6b71e42596791a241e2417931", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "Methodology" - }, - { - "type": "Title", - "element_id": "b344d80e24a3679999fa964450b34bc2", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "The" - }, - { - "type": "Title", - "element_id": "aa3b88196a6407c3866c85acdcc8c981", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "Workforce" - }, - { - "type": "NarrativeText", - "element_id": "cdc3773cb12cf99d302b9f00c48ae1e8", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "required of" - }, - { - "type": "NarrativeText", - "element_id": "b72b62f1295c66f199256c1190177ce6", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "three-pronged approach biomedical data scientist (BDS), drawing from:" - }, - { - "type": "Title", - "element_id": "3d366201f5b88bcbfafb078aee5f2a55", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "Excellence" - }, - { - "type": "Title", - "element_id": "ca8b22d0db83a22db163b560b3e4e515", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "team" - }, - { - "type": "NarrativeText", - "element_id": "e0a6230e370d20dece7ca96c77611cb0", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "took" - }, - { - "type": "Title", - "element_id": "ca978112ca1bbdcafac231b39a23dc4d", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "a" - }, - { - "type": "Title", - "element_id": "663ea1bfffe5038f3f0cf667f14c4257", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "to" - }, - { - "type": "NarrativeText", - "element_id": "a5bed2020bd1f4ea3eca933398c4f0d0", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "identifying" - }, - { - "type": "Title", - "element_id": "0d45f5fd462b8c70bffb10021ac1bcff", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "core" - }, - { - "type": "Title", - "element_id": "32c1cf49a2feee269ed74dd860f72644", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "skills" - }, - { - "type": "NarrativeText", - "element_id": "a24acaf1cb5d6f8a0a0af0e81949765b", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "a) Responses to a 2017 Kaggle1 survey2 of over 16,000 self-identified data scientists working across many industries. Analysis of the Kaggle survey responses from the current data science workforce provided insights into the current generation of data scientists, including how they were trained and what programming and analysis skills they use." - }, - { - "type": "Title", - "element_id": "301d35f1042e1eac9fdef8839fd13a4e", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "b)" - }, - { - "type": "Title", - "element_id": "6b847a0ed0b2c484c73f2749e29b4db5", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "into" - }, - { - "type": "NarrativeText", - "element_id": "1117af46b0a22dd02d3869ab9738a8a8", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "Data science skills taught in BD2K-funded training programs. A qualitative content analysis applied to the descriptions of required offered under the BD2kK-funded training programs. Each course was coded using qualitative data analysis software, with each skill that was present in the description counted once. The coding schema of data science-related skills was inductively developed and was organized four major categories: (1) statistics and math skills; (2) computer science; (3) subject knowledge; (4) general skills, like communication and teamwork. The coding schema is detailed in Appendix A." - }, - { - "type": "NarrativeText", - "element_id": "b63b99f6383ba713b57ddfc77737c5f7", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "was" - }, - { - "type": "Title", - "element_id": "936e5cc5021d8a075f91b7864bf0cec8", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "courses" - }, - { - "type": "UncategorizedText", - "element_id": "6b51d431df5d7f141cbececcf79edf3d", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "12" - }, - { - "type": "Title", - "element_id": "2d2e9ceb1db2bc94a266f3e8b24b8f55", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "c)" - }, - { - "type": "Title", - "element_id": "6d0607a7a2ac9823f9fb2a62ea2b7385", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "Desired" - }, - { - "type": "NarrativeText", - "element_id": "f9c94ebffe2ab721a096cf42b7a9cff9", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "important skills that were mentioned multiple times in" - }, - { - "type": "NarrativeText", - "element_id": "961a38da2886c3cc25091d912769aa0d", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "job job government (8.5%), (42.4%), industry (83.9%), and nonprofit (15.3%) were sampled from websites like Glassdoor, Linkedin, and Ziprecruiter. The content analysis methodology and coding schema in analyzing the training programs were applied to the job descriptions. Because many job ads mentioned the same skill more than once, each occurrence of the skill was coded, therefore weighting single ad." - }, - { - "type": "Title", - "element_id": "32c1cf49a2feee269ed74dd860f72644", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "skills" - }, - { - "type": "NarrativeText", - "element_id": "a486fbc90cd5a32fe44275f5948b2066", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "identified" - }, - { - "type": "Title", - "element_id": "de98e5ea566225a14a9a6b3086253f6d", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "academia" - }, - { - "type": "Title", - "element_id": "75857a45899985be4c4d941e90b6b396", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "from" - }, - { - "type": "Title", - "element_id": "3a6eb0790f39ac87c94f3856b2dd2c5d", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "data" - }, - { - "type": "Title", - "element_id": "8b3a4555f5297c340e5fdff392fe5a5b", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "science-related" - }, - { - "type": "Title", - "element_id": "26f8fe3e12ff690c91f73b24bb45ed01", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "utilized" - }, - { - "type": "Title", - "element_id": "b510c96f289ebcf388da7d2dea6a1e73", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "ads." - }, - { - "type": "Title", - "element_id": "b9776d7ddf459c9ad5b0e1d6ac61e27b", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "the" - }, - { - "type": "UncategorizedText", - "element_id": "3e1e967e9b793e908f8eae83c74dba9b", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "59" - }, - { - "type": "Title", - "element_id": "788eb2efc52660fe41472319f0d2c623", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "ads" - }, - { - "type": "Title", - "element_id": "9d5d7fcf3aa35a4809f92551aed1f26e", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "sector" - }, - { - "type": "Title", - "element_id": "75857a45899985be4c4d941e90b6b396", - "metadata": { - "data_source": { - "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": 167189396509615428390709838081557906335, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "date_created": "2023-03-10T09:32:44+00:00", - "date_modified": "2023-03-10T09:32:44+00:00" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "from" - }, - { - "type": "Title", - "element_id": "9f25a5b0f5e247294ebcf6723c2169b2", + "element_id": "3f14cc0782485365bad0539f7b1bbb22", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -1517,11 +301,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "for core skills necessary for" + "text": "c) Desired skills identified from data science-related job ads. 59 job ads from government (8.5%), academia (42.4%), industry (33.9%), and the nonprofit sector (15.3%) were sampled from websites like Glassdoor, Linkedin, and Ziprecruiter. The content analysis methodology and coding schema utilized in analyzing the training programs were applied to the job descriptions. Because many job ads mentioned the same skill more than once, each occurrence of the skill was coded, therefore weighting important skills that were mentioned multiple times in a single ad." }, { "type": "NarrativeText", - "element_id": "f7f4976ebe430b482f073e28add58182", + "element_id": "c2e95867ed0f25e3d9fe1a6b97447ab9", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -1536,11 +320,11 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "Analysis of the above data provided insights into the current state of biomedical data science training, as well as a view into data science-related skills likely to be needed to prepare the BDS workforce to succeed in the future. Together, these analyses informed recommendations competitive biomedical data scientist." + "text": "Analysis of the above data provided insights into the current state of biomedical data science training, as well as a view into data science-related skills likely to be needed to prepare the BDS workforce to succeed in the future. Together, these analyses informed recommendations for core skills necessary for a competitive biomedical data scientist." }, { "type": "NarrativeText", - "element_id": "4a99b0f26eb7267230c6994d9ab7d60b", + "element_id": "8e6dc8d9bc74e032451cc1a6a0da4d10", "metadata": { "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", @@ -1555,7 +339,7 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "' Kaggle is an online community for data scientists, serving as a platform for collaboration, competition, and learning: http://kaggle.com ? In August 2017, Kaggle conducted an industry-wide survey to gain clearer picture of the state of data science and machine learning. A standard set of questions were asked of all respondents, with more specific questions related to work for employed data scientists and questions related to learning for data scientists in training. Methodology and results: https://www.kaggle.com/kaggle/kaggle-survey-2017" + "text": "1 Kaggle is an online community for data scientists, serving as a platform for collaboration, competition, and learning: http://kaggle.com 2 In August 2017, Kaggle conducted an industry-wide survey to gain a clearer picture of the state of data science and machine learning. A standard set of questions were asked of all respondents, with more specific questions related to work for employed data scientists and questions related to learning for data scientists in training. Methodology and results: https://www.kaggle.com/kaggle/kaggle-survey-2017" }, { "type": "UncategorizedText", diff --git a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json index 5afaa3fefc..2034680177 100644 --- a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json +++ b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json @@ -360,25 +360,6 @@ }, "text": "Long-term contracts.—If you are required to change your method of accounting for long-term contracts under section 460, see Notice 87-61 (9/21/87), 1987-38 IRB 40, for the notification procedures that must be followed. Other methods. —Unless the Service has published a regulation or procedure to the contrary, all other changes in accounting methods required by the Act are automatically considered to be approved by the Commissioner. Examples of method changes automatically approved by the Commissioner are those changes required to effect: (1) the repeal of the reserve method for bad debts of taxpayers other than financial institutions (Act section 805); (2) the repeal of the installment method for sales under a revolving credit plan (Act section 812); (3) the Inclusion of mcome attributable to the sale or furnishing of utility services no later than the year in which the services were provided to customers (Act section 821); and (4) the repeal of the deduction for qualified discount coupons (Act section 823). Do not file Form 3115 for these changes." }, - { - "type": "NarrativeText", - "element_id": "7685df2334a5f6c8c8099dea61a8f1b4", - "metadata": { - "data_source": { - "url": "abfs://container1/IRS-form-1987.png", - "version": 328871203465633719836776597535876541325, - "record_locator": { - "protocol": "abfs", - "remote_file_path": "container1/IRS-form-1987.png" - }, - "date_created": "2023-03-10T09:44:55+00:00", - "date_modified": "2023-03-10T09:44:55+00:00" - }, - "filetype": "image/png", - "page_number": 1 - }, - "text": "Long-term contracts.—If you are required to change your method of accounting for long-term contracts under section 460, see Notice 87-61 (9/21/87), 1987-38 IRB 40, for the notification procedures that must be followed." - }, { "type": "Title", "element_id": "5756fb398995bb6518a87637f24f426e", diff --git a/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json index 9c98b4af47..35d4a581e4 100644 --- a/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json @@ -30,8 +30,8 @@ "text": "Data in Brief" }, { - "type": "NarrativeText", - "element_id": "9234133787d0a6b3976b16569c0b5cf3", + "type": "Title", + "element_id": "0ca3f075fdccf9232449ff461b63ceb9", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -161,63 +161,23 @@ }, { "type": "Title", - "element_id": "b877cc5d670d770084dcc0bb41ac73a0", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "Subject area More specific subject area Type of data" - }, - { - "type": "Title", - "element_id": "b27e559f6c00d2bde61efba5db252e31", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "Materials engineering" - }, - { - "type": "Title", - "element_id": "a2c3879ecb580742973c6a914fb905bb", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "Surface science and engineering" - }, - { - "type": "Title", - "element_id": "1064dcef42380cfdb90c668aa3a670a3", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 1 - }, - "text": "Table and figure" - }, - { - "type": "Title", - "element_id": "e4359c72057b318ddf5a64f9b97539c4", + "element_id": "ac89a2886224c42ad15982cd34421ff8", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1 }, - "text": "n Corresponding author. tayo.sanni@yahoo.com; SanniO@tut.ac.za" + "text": "Subject area More specific subject area Surface science and engineering Type of data" }, { - "type": "Title", - "element_id": "e102dc7c1db28c29d5e4bde8062592ed", + "type": "NarrativeText", + "element_id": "0a789b33a0101a46f5a01d22d9a6ce2b", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 1 }, - "text": "E-mail address: tayo.sanni@yahoo.com (O. Sanni)." + "text": "* Corresponding author. tayo.sanni@yahoo.com; SanniO@tut.ac.za E-mail address: tayo.sanni@yahoo.com (O. Sanni)." }, { "type": "NarrativeText", @@ -310,44 +270,14 @@ "text": "Value of the data" }, { - "type": "NarrativeText", - "element_id": "682e6210329b84f8b00548088196ffc9", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "(cid:1) Data presented here provide optimum conditions of waste material as inhibitor for stainless steel Type 316 in 0.5 M H2SO4 medium. The given data describe the inhibitive performance of eco-friendly egg shell powder on austenitic stainless steel Type 316 corrosion in sulphuric acid environment." - }, - { - "type": "NarrativeText", - "element_id": "1d61e3468bc681ba1a7e647000c6828c", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "(cid:1) The data obtained for the inhibition of waste product (egg shell powder) on stainless steel Type 316 can be used as basis in determining the inhibitive performance of the same inhibitor in other environments." - }, - { - "type": "NarrativeText", - "element_id": "39b6040280a179e1f8e4f4fb5ec4ae05", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "(cid:1) The data can be used to examine the relationship between the process variable as it affect the" - }, - { - "type": "Title", - "element_id": "1ddde62c3188f81dfc835b6f036f1734", + "type": "ListItem", + "element_id": "7def44ffc91f3f064b85dc04b23767ec", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "nature of inhibition of metals." + "text": "© Data presented here provide optimum conditions of waste material as inhibitor for stainless steel Type 316 in 0.5M H2SO4 medium. The given data describe the inhibitive performance of eco-friendly egg shell powder on austenitic stainless steel Type 316 corrosion in sulphuric acid environment. © The data obtained for the inhibition of waste product (egg shell powder) on stainless steel Type 316 can be used as basis in determining the inhibitive performance of the same inhibitor in other environments. © The data can be used to examine the relationship between the process variable as it affect the nature of inhibition of metals." }, { "type": "Title", @@ -529,25 +459,15 @@ }, "text": "Exposure Time (Hours)" }, - { - "type": "UncategorizedText", - "element_id": "25db7b1d2f5780559e1034d72bcb4050", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "Fig. 1. Weight loss versus exposure time for stainless steel presence of ES." - }, { "type": "NarrativeText", - "element_id": "cbd563dd2fcd7d0b5a0b2173465fd328", + "element_id": "45cd54c64e38abe8c1128a5979ca8cd5", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "immersed in 0.5 M H2SO4 solution in the absence and" + "text": "Fig. 1. Weight loss versus exposure time for stainless steel immersed in 0.5M H2SO, solution in the absence and presence of ES." }, { "type": "NarrativeText", @@ -1080,14 +1000,14 @@ "text": "O. Sanni, A.P.I. Popoola / Data in Brief 22 (2019) 451–457" }, { - "type": "FigureCaption", - "element_id": "27b45633a0f31b9e01d179d70d7dc282", + "type": "Image", + "element_id": "b5ee6af3d776b0bbd2e581a3ab2ab2e1", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "5 1 os = — 10; =o ° © —\" 205 i —~é é —ip a5 — Control -2 — & 2.5 T T T 0.0000001 + —-0.00001 0.001 O14 Current Density (A/cm2)" + "text": "Potential (Vv)nm°in°}aryT T T0.00001 0.001 olCurrent Density (A/cm2)" }, { "type": "UncategorizedText", @@ -1450,34 +1370,34 @@ "text": "455" }, { - "type": "FigureCaption", - "element_id": "273fb301b173075f79b2cbdab962e2ff", + "type": "Image", + "element_id": "caa364fead90039aae1f13d64dcb8b37", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5 }, - "text": "SEM HV: Q0KY WD: 14.89 rmrm ‘9EM MAO: 209 x Det: DOE Pectomsence In nanospact" + "text": "SEM HV: Q0KY WD: 14.89 rmrm‘DEM MAO: 209 x ‘Dor Pecforsence In nenospact" }, { - "type": "FigureCaption", - "element_id": "520d1da08c86ce165cd2843e2dc27f98", + "type": "Image", + "element_id": "a0463ca888a6f2c8c3ba40ba47be0f2f", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5 }, - "text": "SEMHV: 20.0KV WD: 15.54 mm EM ING: ACO x Dei: OSE" + "text": "gEOOwaeSemny. z00RV | WD: 1424 renn rtirint VEoa3 Tescan20 yin Fertormaros in nancepace|" }, { - "type": "FigureCaption", - "element_id": "d04d110c16a4ebc184fa130f09b8d423", + "type": "Image", + "element_id": "88301d6b47b17df03b78789b9890a6f1", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5 }, - "text": "Sem ny. 200 Rv" + "text": "°@¢Naafe«MgsSEM HY: 20.0KV 7 ETOP LU ULL UL OCT 0BEM IAAG: 400 x a" }, { "type": "NarrativeText", @@ -1530,7 +1450,7 @@ "text": "O. Sanni, A.P.I. Popoola / Data in Brief 22 (2019) 451–457" }, { - "type": "ListItem", + "type": "Title", "element_id": "a80826543c9e0d0e9f6c2108ae3c3f73", "metadata": { "data_source": {}, @@ -1560,17 +1480,7 @@ "text": "Austenitic stainless steel Type 316 was used in this study with chemical composition reported in [1,2]. The chemicals used were of annular grade. The inhibitor concentrations are in the range of 2, 4, 6, 8 and 10 g [3–5]. The structural formula of egg shell powder is shown in Fig. 9." }, { - "type": "FigureCaption", - "element_id": "060e14f01e484ba252e902cd5c6f94f9", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 6 - }, - "text": "ou H,;COCHNY OH" - }, - { - "type": "Title", + "type": "NarrativeText", "element_id": "1dc2692eee9b01e9a960f80c4dabe07b", "metadata": { "data_source": {}, @@ -1890,74 +1800,14 @@ "text": "References" }, { - "type": "NarrativeText", - "element_id": "d844a31ead19b2e2fae786d2a5495072", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "[1] O. Sanni, A.P.I. Popoola, O.S.I. Fayomi, Enhanced corrosion resistance of stainless steel type 316 in sulphuric acid solution" - }, - { - "type": "NarrativeText", - "element_id": "d0be94eaaf9c0f43bc51381f031e1381", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "using eco-friendly waste product, Results Phys. 9 (2018) 225–230." - }, - { - "type": "NarrativeText", - "element_id": "7e9cfcc1c32c353e319aae7d9be537bd", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "[2] O. Sanni, A.P.I. Popoola, A. Kolesnikov, Constitutive modeling for prediction of optimal process parameters in corrosion" - }, - { - "type": "NarrativeText", - "element_id": "c00e8be0806aa2ded72da0ef746a4291", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "inhibition of austenitic stainless steel (Type 316)/acidic medium, Mater. Res. Express. 5 (10) (2018) 1–15." - }, - { - "type": "NarrativeText", - "element_id": "1d76a4bb6ba7984cea4548ab574beb8f", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "[3] O. Sanni, A.P.I. Popoola, O.S.I. Fayomi, The inhibitive study of egg shell powder on UNS N08904 austenitic stainless steel" - }, - { - "type": "NarrativeText", - "element_id": "ffd9e4babdf76600a881851ebbf35d3f", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 7 - }, - "text": "corrosion in chloride solution, Def. Technol. 14 (2018) 463–468." - }, - { - "type": "NarrativeText", - "element_id": "dd7f4838500dd709556225fa3f6b7339", + "type": "ListItem", + "element_id": "86174db2f99ff948055caeda83334bb7", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 7 }, - "text": "[4] O. Sanni, A.P.I. Popoola, O.S.I. Fayomi, C.A. Loto, A comparative study of inhibitive effect of waste product on stainless steel corrosion in sodium chloride/sulfuric acid environments, Metallogr. Microstruct. Anal. (2018) 1–17. https://doi.org/10.1007/ s13632-018-0495-5." + "text": "[1] 0. Sanni, A.P.I. Popoola, O.S.I. Fayomi, Enhanced corrosion resistance of stainless steel type 316 in sulphuric acid solution using eco-friendly waste product, Results Phys. 9 (2018) 225-230. [2] O. Sanni, A.P.I. Popoola, A. Kolesnikov, Constitutive modeling for prediction of optimal process parameters in corrosion inhibition of austenitic stainless steel (Type 316)/acidic medium, Mater. Res. Express. 5 (10) (2018) 1-15. [3] O. Sanni, A.P.I. Popoola, O.S.I. Fayomi, The inhibitive study of egg shell powder on UNS N08904 austenitic stainless steel corrosion in chloride solution, Def. Technol. 14 (2018) 463-468. [4] O. Sanni, A.P.I. Popoola, 0.S.I. Fayomi, C.A. Loto, A comparative study of inhibitive effect of waste product on stainless steel corrosion in sodium chloride/sulfuric acid environments, Metallogr. Microstruct. Anal. (2018) 1-17. https://doi.org/10.1007/ $13632-018-0495-5, [5] O. Sanni, A-P.I. Popoola, O.S.1. Fayomi, Inhibition of engineering material in sulphuric acid solution using waste product, Contributed Papers from Materials Science and Technology (MS&T18), 2018. (lnttps://doi.org/10.7449/2018/MST_2018_254 261)." }, { "type": "NarrativeText", diff --git a/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json index d7bdce8ec2..5d4295f490 100644 --- a/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json @@ -30,8 +30,8 @@ "text": "Data in Brief" }, { - "type": "NarrativeText", - "element_id": "9234133787d0a6b3976b16569c0b5cf3", + "type": "Title", + "element_id": "0ca3f075fdccf9232449ff461b63ceb9", "metadata": { "data_source": {}, "filetype": "application/pdf", @@ -250,77 +250,27 @@ "text": "Value of the data" }, { - "type": "NarrativeText", - "element_id": "f2fdefc49840022ffb3a88bd4a3512d0", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "(cid:2) The dataset contains 60 different problem instances of the MDVSP that can be used to evaluate the" - }, - { - "type": "Title", - "element_id": "bd7d750cb9f652c80c17a264072b8858", + "type": "ListItem", + "element_id": "510d0bce379a0d3ba5ff46d536bdb7c5", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "performance of the algorithms for the MDVSP." + "text": "© The dataset contains 60 different problem instances of the MDVSP that can be used to evaluate the performance of the algorithms for the MDVSP. © The data provide all the information that is required to model the MDVSP by using the existing mathematical formulations. e All the problem instances are available for use without any restrictions. e The benchmark solutions and solution time for the problem instances are presented in [3] and can be used for the comparison. © The dataset includes a program that can generate similar problem instances of different sizes." }, { "type": "NarrativeText", - "element_id": "7c8bc2811f71480b433eb6fee2a3bb33", + "element_id": "f2fdefc49840022ffb3a88bd4a3512d0", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "(cid:2) The data provide all the information that is required to model the MDVSP by using the existing" + "text": "(cid:2) The dataset contains 60 different problem instances of the MDVSP that can be used to evaluate the" }, { "type": "Title", - "element_id": "68d39f7bcfe99749cc221fa901314626", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "mathematical formulations." - }, - { - "type": "NarrativeText", - "element_id": "e69dab6e2bc16d11cfd2d80a804d89fb", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "(cid:2) All the problem instances are available for use without any restrictions. (cid:2) The benchmark solutions and solution time for the problem instances are presented in [3] and can" - }, - { - "type": "NarrativeText", - "element_id": "24d7f2ed4386a169639b93a5bf03fd79", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "be used for the comparison." - }, - { - "type": "NarrativeText", - "element_id": "1c1d6b35ac0925a35ea3bb4d018e675f", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "(cid:2) The dataset includes a program that can generate similar problem instances of different sizes." - }, - { - "type": "ListItem", "element_id": "c2b2b778d53cc9a1cb4dc340476bc5aa", "metadata": { "data_source": {}, @@ -330,134 +280,24 @@ "text": "1. Data" }, { - "type": "NarrativeText", - "element_id": "41ce7670e476aaf9a595bc28c13dbba0", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "The dataset contains 60 different problem instances of the multiple depot vehicle scheduling pro- blem (MDVSP). Each problem instance is provided in a separate file. Each file is named as ‘RN-m-n-k.dat’, where ‘m’, ‘n’, and ‘k’ denote the number of depots, the number of trips, and the instance number" - }, - { - "type": "Title", - "element_id": "10c22bcf4c768b515be4e94bcafc71bf", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "for" - }, - { - "type": "NarrativeText", - "element_id": "a18c70d23b71c51ddfe33311232c241c", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "‘RN-8-1500-01.dat’, is the first problem instance with 8 depots and 1500 trips. For the number of depots, m, we used three values, 8, 12, and 16. The four values for the number of trips, n, are 1500, 2000, 2500, and 3000. For each size, (m,n), five instances are provided. The dataset can be downloaded from https://orlib.uqcloud.net." - }, - { - "type": "UncategorizedText", - "element_id": "aea66a7c89c6de4d3e3ed6c1ada31104", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "the size," - }, - { - "type": "UncategorizedText", - "element_id": "e0feab8a8888b2955af1cc1a2acff883", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "‘ðm; nÞ’," - }, - { - "type": "UncategorizedText", - "element_id": "0b113c91aaaf031e5d7b74747e1b4153", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "respectively. For example," - }, - { - "type": "UncategorizedText", - "element_id": "6dd3e9101394a1fbacb451c4c9ba03b9", + "type": "ListItem", + "element_id": "86e53159056da85c215281a9c68d46b9", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "the problem instance," + "text": "For each problem instance, the following information is provided: The number of depots (m), The number of trips (n), The number of locations (I), The number of vehicles at each depot, For each tripie 1,2,...,n,a start time, ft}, an end time, ff, a start location, i, and an end location, i, and" }, { "type": "NarrativeText", - "element_id": "33d26eae1edf215a9677101c7147d671", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "For each problem instance, the following information is provided: The number of depots mð The number of trips ðnÞ, The number of locations ðlÞ, The number of vehicles at each depot, For each trip i A 1; 2; …; n, a start time, ts" - }, - { - "type": "UncategorizedText", - "element_id": "c6490fc185478150e7816c45ef8a48d5", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "Þ," - }, - { - "type": "Title", - "element_id": "5a15b4000add06e52b66591cd8cac950", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "i , an end time, te" - }, - { - "type": "Title", - "element_id": "7798ae4daad9264de38e67c98f2bd624", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "i , a start location, ls" - }, - { - "type": "UncategorizedText", - "element_id": "801a0d00a5b76dbd0f039368ee45eda3", + "element_id": "07732da32c53fed3ffd5342c61ab643b", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "i , and an end location, le i ," - }, - { - "type": "Title", - "element_id": "6201111b83a0cb5b0922cb37cc442b9a", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "and" + "text": "The dataset contains 60 different problem instances of the multiple depot vehicle scheduling pro- blem (MDVSP). Each problem instance is provided in a separate file. Each file is named as ‘RN-m-n-k.dat’, where ‘m’, ‘n’, and ‘k’ denote the number of depots, the number of trips, and the instance number ‘RN-8–1500-01.dat’, for is the first problem instance with 8 depots and 1500 trips. For the number of depots, m, we used three values, 8, 12, and 16. The four values for the number of trips, n, are 1500, 2000, 2500, and 3000. For each size, ðm; nÞ, five instances are provided. The dataset can be downloaded from https://orlib.uqcloud.net." }, { "type": "NarrativeText", @@ -511,13 +351,13 @@ }, { "type": "NarrativeText", - "element_id": "faee1001fc912565a74ea2d69fa0d689", + "element_id": "694b9c582265698bf49806b056c64adc", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "travel empty from —¢). Aschedule is given by the sequence in which a vehicle can cover the trips. The MDVSP is to determine the minimum number of schedules to cover all trips that minimizes total time in waiting and empty travel. The following requirements must be satisfied:" + "text": "j , the vehicle must travel empty from le j (cid:3)te i Þ. A schedule is given by the sequence in which a vehicle can cover the trips. The MDVSP is to determine the minimum number of schedules to cover all trips that minimizes total time in waiting and empty travel. The following requirements must be satisfied:" }, { "type": "NarrativeText", @@ -529,76 +369,6 @@ }, "text": "A trip j can be covered after trip i by the same vehicle, if ts j" }, - { - "type": "NarrativeText", - "element_id": "3e549e73bba49a63f20841b5821cfda9", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "i to ls" - }, - { - "type": "NarrativeText", - "element_id": "43dad32a26a446c5a2c74f3f2328b849", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": ". If le i ls le i j , otherwise, the vehicle may require waiting at le i for the duration of ðts" - }, - { - "type": "Title", - "element_id": "3feb623147ddb3265b5968ce2efb8f6b", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "Z te" - }, - { - "type": "NarrativeText", - "element_id": "5201e1037409ea15055e320409a9f5eb", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "i þδ" - }, - { - "type": "Title", - "element_id": "189f40034be7a199f1fa9891668ee3ab", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "j" - }, - { - "type": "Title", - "element_id": "a10959d132f2b0d3723ae6b8b77f86b7", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "a ls" - }, - { - "type": "Title", - "element_id": "4137b01e139589b7a1d3b3fc4da031d8", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "must" - }, { "type": "ListItem", "element_id": "2d6b506bd58a7dd7bbf1c8599ef630c8", @@ -629,65 +399,25 @@ }, "text": "A sufficient number of vehicles are provided to maintain the feasibility of an instance. For each instance size ðm; nÞ, Table 1 provides the average of the number of locations, the number of times, the number of vehicles, and the number of possible empty travels, over five instances. The number of locations includes m distinct locations for depots and the number of locations at which various trips start or end. The number of times includes the start and the end time of the planning horizon and the start/end times for the trips. The number of vehicles is the total number of vehicles from all the depots. The number of possible empty travels is the number of possible connections between trips that require a vehicle travelling empty between two consecutive trips in a schedule." }, - { - "type": "Title", - "element_id": "252f10c83610ebca1a059c0bae8255eb", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "f" - }, { "type": "NarrativeText", - "element_id": "928fa0dcad70f173bc989ee5715375c5", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "The description of the file for each problem instance is presented in Table 2. The first line in the file provides the number of depots ðmÞ, the number of trips, ðnÞ, and the number of locations ðlÞ, in the problem instance. The next n lines present the information for n trips. Each line corresponds to a trip, i A 1; …; n g, and provides the start location, the start time, the end location, and the end time of trip i. The next l lines present the travel times between any two locations, i; jA 1; …; l" - }, - { - "type": "UncategorizedText", - "element_id": "89507815c6b4a6f31e6d3da7fca6b561", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "(cid:1)" - }, - { - "type": "UncategorizedText", - "element_id": "33a2b57b388470db1cb13defbe73dc18", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "(cid:3)" - }, - { - "type": "UncategorizedText", - "element_id": "cdb4ee2aea69cc6a83331bbe96dc2caa", + "element_id": "e731dc92fddc0512e142bfb2bed62bbf", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "." + "text": "The dataset also includes a program ‘GenerateInstance.cpp’ that can be used to generate new instances. The program takes three inputs, the number of depots ðmÞ, the number of trips ðnÞ, and the number of instances for each size ðm; nÞ." }, { "type": "NarrativeText", - "element_id": "e731dc92fddc0512e142bfb2bed62bbf", + "element_id": "92b491d0e108ec13f263b16646ecac65", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 3 }, - "text": "The dataset also includes a program ‘GenerateInstance.cpp’ that can be used to generate new instances. The program takes three inputs, the number of depots ðmÞ, the number of trips ðnÞ, and the number of instances for each size ðm; nÞ." + "text": "The description of the file for each problem instance is presented in Table 2. The first line in the file provides the number of depots (m), the number of trips, (n), and the number of locations (I), in the problem instance. The next n lines present the information for n trips. Each line corresponds to a trip, ie{1,...,n}, and provides the start location, the start time, the end location, and the end time of trip i. The next | lines present the travel times between any two locations, i,j e {1, wal}. The dataset also includes a program ‘Generatelnstance.cpp’ that can be used to generate new instances. The program takes three inputs, the number of depots (m), the number of trips (n), and the" }, { "type": "UncategorizedText", @@ -850,34 +580,14 @@ "text": "Table 2 Description of file format for each problem instance." }, { - "type": "Title", - "element_id": "151e509ce97fe40eecae3822c78adcf5", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "Number of lines" - }, - { - "type": "Title", - "element_id": "0d42fdb9458af19413eee0a1227f415c", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "Number of columns in each line" - }, - { - "type": "Title", - "element_id": "526e0087cc3f254d9f86f6c7d8e23d95", + "type": "NarrativeText", + "element_id": "444f48f6d4f0ee6d3a04b7bf76218980", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "Description" + "text": "Number of Number of columns in Description lines each line" }, { "type": "UncategorizedText", @@ -920,47 +630,17 @@ "text": "l" }, { - "type": "Title", - "element_id": "336074805fc853987abe6f7fe3ad97a6", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "time" - }, - { - "type": "NarrativeText", - "element_id": "78f6ff03dfac8dfb7f319de1e369590d", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "The number of depots, the number of trips, and the number of locations. The number of vehicles rg at each depot d. One line for each trip, i= 1,2, ...,n. Each line provides the start location and the end time ¢¢ for the corresponding trip. Each element, 6j, where i,j ¢ 1,2, ...,1, refers to the travel time between location i and location j." - }, - { - "type": "Title", - "element_id": "8ee69286d5f681913dbfdeb60bedc572", + "type": "ListItem", + "element_id": "f096a8499e50cac1f45ceb8340dace5a", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "i , the end location le" + "text": "The number of depots, the number of trips, and the number of locations. The number of vehicles rd at each depot d. One line for each trip, i ¼ 1; 2; …; n. Each line provides the start location ls time ts i and the end time te i for the corresponding trip. Each element, δij; where i; j A 1; 2; …; l, refers to the travel time between location i and location j." }, { "type": "Title", - "element_id": "08238905e7bba7115b7d7d58fef13ec6", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "i , the start" - }, - { - "type": "ListItem", "element_id": "764eef872135149aaf95224bab69c844", "metadata": { "data_source": {}, @@ -1031,102 +711,12 @@ }, { "type": "NarrativeText", - "element_id": "5a1d84f7d74fc4ceeacb634d524cc041", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "[1] G. Carpaneto, M. Dell'Amico, M. Fischetti, P. Toth, A branch and bound algorithm for the multiple depot vehicle scheduling" - }, - { - "type": "UncategorizedText", - "element_id": "bec40b25a277a08de3415e33284fc76d", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "problem, Networks 19 (5) (1989) 531–548." - }, - { - "type": "NarrativeText", - "element_id": "19dee0a4e8fd073350e234b4352b8af6", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "[2] N. Kliewer, T. Mellouli, L. Suhl, A time–space network based exact optimization model for multi-depot bus scheduling, Eur." - }, - { - "type": "UncategorizedText", - "element_id": "5f5ca82752a3220998c06ea0c44eb80e", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "J. Oper. Res. 175 (3) (2006) 1616–1627." - }, - { - "type": "UncategorizedText", - "element_id": "64cd13c78330953bd999d37dacbeaf0e", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "[3] S. Kulkarni, M. Krishnamoorthy, A. Ranade, A.T. Ernst, R. Patil, A new formulation and a column generation-based heuristic" - }, - { - "type": "NarrativeText", - "element_id": "c4f2c64b5f38feaa921647abceebaec8", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "for the multiple depot vehicle scheduling problem, Transp. Res. Part B Methodol. 118 (2018) 457–487." - }, - { - "type": "NarrativeText", - "element_id": "16c341408703257ff517dcc76140e2c0", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "[4] A.S. Pepin, G. Desaulniers, A. Hertz, D. Huisman, A comparison of five heuristics for the multiple depot vehicle scheduling" - }, - { - "type": "UncategorizedText", - "element_id": "aa252076bc877d1ba2b95aa13b73ff72", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "problem, J. Sched. 12 (1) (2009) 17." - }, - { - "type": "UncategorizedText", - "element_id": "2e00441177bee9377583470218bea299", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 4 - }, - "text": "[5] C.C. Ribeiro, F. Soumis, A column generation approach to the multiple-depot vehicle scheduling problem, Oper. Res. 42 (1)" - }, - { - "type": "UncategorizedText", - "element_id": "4b1b8c9df00f25e26176a85d84c8c927", + "element_id": "ba0af0b44e7cc27de119a1771c07dfc2", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 4 }, - "text": "(1994) 41–52." + "text": "[1] G. Carpaneto, M. Dell'Amico, M. Fischetti, P. Toth, A branch and bound algorithm for the multiple depot vehicle scheduling problem, Networks 19 (5) (1989) 531-548. [2] N. Kliewer, T. Mellouli, L. Suhl, A time-space network based exact optimization model for multi-depot bus scheduling, Eur. J. Oper. Res. 175 (3) (2006) 1616-1627. [3] S. Kulkarni, M. Krishnamoorthy, A. Ranade, A.T. Ernst, R. Patil, A new formulation and a column generation-based heuristic for the multiple depot vehicle scheduling problem, Transp. Res. Part B Methodol. 118 (2018) 457-487. [4] A.S. Pepin, G. Desaulniers, A. Hertz, D. Huisman, A comparison of five heuristics for the multiple depot vehicle scheduling problem, J. Sched. 12 (1) (2009) 17. [5] C.C. Ribeiro, F. Soumis, A column generation approach to the multiple-depot vehicle scheduling problem, Oper. Res. 42 (1) (1994) 41-52." } ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json index 30302a3ffa..99b11a3a14 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json @@ -169,16 +169,6 @@ }, "text": "37], layout detection [38, 22], table detection [26], and scene text detection [4]. A generalized learning-based framework dramatically reduces the need for the manual specification of complicated rules, which is the status quo with traditional methods. DL has the potential to transform DIA pipelines and benefit a broad spectrum of large-scale document digitization projects." }, - { - "type": "NarrativeText", - "element_id": "1f0f5df7c23d4f8e8de4de3085abd7d8", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "The library implements simple and intuitive Python APIs without sacrificing generalizability and versatility, and can be easily installed via pip. Its convenient functions for handling document image data can be seamlessly integrated with existing DIA pipelines. With detailed documentations and carefully curated tutorials, we hope this tool will benefit a variety of end-users, and will lead to advances in applications in both industry and academic research." - }, { "type": "NarrativeText", "element_id": "c1f1ba1630bc19bd24c1dfbc1548f2d8", @@ -201,73 +191,23 @@ }, { "type": "ListItem", - "element_id": "074b2bd4ba1bf0caf3dbf1973217416a", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "1. An off-the-shelf toolkit for applying DL models for layout detection, character" - }, - { - "type": "ListItem", - "element_id": "569ce8891b02bc38f50a0cde0039e951", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "2. A rich repository of pre-trained neural network models (Model Zoo) that" - }, - { - "type": "ListItem", - "element_id": "18dcbc2839f9783d2c91cbce75d3e685", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "3. Comprehensive tools for efficient document image data annotation and model" - }, - { - "type": "ListItem", - "element_id": "efe6ba3afae54e3c7a05d81583543296", + "element_id": "dc2c331204369d29f5bdcd8dc88a8174", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "4. A DL model hub and community platform for the easy sharing, distribu- tion, and discussion of DIA models and pipelines, to promote reusability, reproducibility, and extensibility (Section 4)" - }, - { - "type": "Title", - "element_id": "50f59772d4134ececeaf37069d480784", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "underlies the off-the-shelf usage" - }, - { - "type": "Title", - "element_id": "c7f4b9a2c7b93fdcc32112de7d9563ba", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "recognition, and other DIA tasks (Section 3)" + "text": "1. An off-the-shelf toolkit for applying DL models for recognition, and other DIA tasks (Section Bp ayout det ection, character 2. A rich repository of pre-trained neural network models (Model Zoo) that underlies the off-the-shelf usage 3. Comprehensive tools for efficient document image tuning to support different levels of customization 4. A DL model hub and community platform for t tion, and discussion of DIA models and pipeline: reproducibility, and extensibility (Section [4) ne easy S. ata annotation and model haring, distribu- s, to promote reusability," }, { "type": "NarrativeText", - "element_id": "9a576fe6eb4355cdf1e772cf462a9eb7", + "element_id": "1f0f5df7c23d4f8e8de4de3085abd7d8", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 2 }, - "text": "tuning to support different levels of customization" + "text": "The library implements simple and intuitive Python APIs without sacrificing generalizability and versatility, and can be easily installed via pip. Its convenient functions for handling document image data can be seamlessly integrated with existing DIA pipelines. With detailed documentations and carefully curated tutorials, we hope this tool will benefit a variety of end-users, and will lead to advances in applications in both industry and academic research." }, { "type": "NarrativeText", @@ -496,7 +436,7 @@ "data_source": {}, "filetype": "application/pdf", "page_number": 5, - "text_as_html": "
Dataset| Base Model'|Large ModelNotes
PubLayNet B8]|F/MMLayouts of modern scientific documents
M-Layouts of scanned modern magazines and scientific reports
F-Layouts of scanned US newspapers from the 20th century
TableBankFFnd business document. Table region on modern scientific
HJDatasetF/M-Layouts of history Japanese documents
" + "text_as_html": "
Dataset| Base Model'|Large Model| Notes
PubLayNet B8]|F/MMLayouts of modern scientific documents
M-Layouts of scanned modern magazines and scientific reports
F-Layouts of scanned US newspapers from the 20th century
TableBankFFnd business document. Table region on modern scientific
HJDatasetF/M-Layouts of history Japanese documents
" }, "text": "Dataset | Base Model'| Large Model | Notes PubLayNet B8]| F/M M Layouts of modern scientific documents PRImA M - nned modern magazines and scientific reports Newspapei F - canned US newspapers from the 20th century TableBank F F Table region on modern scientific and business document HJDataset F/M - Layouts of history Japanese documents" }, @@ -591,34 +531,14 @@ "text": "layout data structures, which are optimized for efficiency and versatility. 3) When necessary, users can employ existing or customized OCR models via the unified API provided in the OCR module. 4) LayoutParser comes with a set of utility functions for the visualization and storage of the layout data. 5) LayoutParser is also highly customizable, via its integration with functions for layout data annotation and model training. We now provide detailed descriptions for each component." }, { - "type": "NarrativeText", - "element_id": "65f9f864775ddef6f9895c53e16c50d4", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 5 - }, - "text": "1 import layoutparser as lp 2 image = cv2 . imread ( \" image_file \" ) # load images 3 model = lp . De t e c tro n2 Lay outM odel (" - }, - { - "type": "Title", - "element_id": "61b33f079528d200f91471f41645cdc6", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 5 - }, - "text": "4 5 layout = model . detect ( image )" - }, - { - "type": "NarrativeText", - "element_id": "6cd3a9e132c1264a05ec11a2df6b8066", + "type": "ListItem", + "element_id": "e416e69991bf6a4b338df18ebdb6e712", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 5 }, - "text": "\" lp :// PubLayNet / f as t er _ r c nn _ R _ 50 _ F P N_ 3 x / config \" )" + "text": "import layoutparser as lp image = cv2.imread(\"image_file\") # load images model = lp.Detectron2LayoutModel ( \"1p://PubLayNet/faster_rcnn_R_50_FPN_3x/config\") layout = model.detect (image)" }, { "type": "NarrativeText", @@ -651,14 +571,14 @@ "text": "Z. Shen et al." }, { - "type": "FigureCaption", - "element_id": "185e67615d123b35d38ea72e0cdb6d99", + "type": "Image", + "element_id": "2f498bdd91739a7083490999507420a5", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 6 }, - "text": "- ° . 3 a a 4 a 3 oo er ‘ 2 § 8 a 8 3 3 ‘ £ 4 A g a 9 ‘ 3 ¥ Coordinate g 4 5 3 + § 3 H Extra Features [O=\") [Bo] eaing i Text | | Type | | ower ° & a ¢ o [ coordinatel textblock1, 3 3 ’ g Q 3 , textblock2 , layoutl ] 4 q ® A list of the layout elements Ff" + "text": "33§3 fectange vada8883 Coordinate83 +*Block | [Block | [Read8 Extra features Tet | [Tye | [oder[ coordinatel textblock1 |» , see383 , textblock2 , layout] ]4A list of the layout elementsThe same transformation and operation APIs" }, { "type": "Title", @@ -1102,14 +1022,14 @@ "text": "9" }, { - "type": "FigureCaption", - "element_id": "975d6cb141cb0a0313375630ae063fa8", + "type": "Image", + "element_id": "6df6057f894a166cf24fd34f64267f09", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 9 }, - "text": "x09 Burpunog uayor Aeydsiq 1 vondo 10g Guypunog usyoy apir:z uondo Mode I: Showing Layout on the Original Image Mode Il: Drawing OCR'd Text at the Correspoding Position" + "text": "a ESStee eaeoooMode I: Showing Layout on the Original ImageMode Il: Drawing OCR'd Text at the Correspoding Position10g Bpunog vayoy feyds1q :1 vondo‘xog Burpunog vay apiH z word" }, { "type": "NarrativeText", @@ -1172,14 +1092,14 @@ "text": "Z. Shen et al." }, { - "type": "FigureCaption", - "element_id": "2680b3c7a55754a3ba2738cb3d9d5e8b", + "type": "Image", + "element_id": "cd0055b04f6049e9d9bf49a4f309f7e9", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 10 }, - "text": "et Intra-column reading order Token Categories tie (Adress 2) tee (NE sumber Variable HEE company type Column Categories (J tite we) adaress —_ (7) section Header by ‘e * Column reading order a a (a) Illustration of the original Japanese Maximum Allowed Height BRE B>e EER eR (b) Illustration of the recreated document with dense text structure for better OCR performance" + "text": "Text‘Token CategoriestieAddress(Numberig:3pio Bupeas uwunjog(a) Illustration of the original Japanese document with detected layout elements highlighted in colored boxesColumn CategoriesCRE) OR REKER te setPikes enceee+41ybiay pamoyy wnwrxey(b) Illustration of the recreated document with dense text structure for better OCR performance" }, { "type": "NarrativeText", @@ -1302,14 +1222,14 @@ "text": "The digitization of historical documents can unlock valuable data that can shed light on many important social, economic, and historical questions. Yet due to scan noises, page wearing, and the prevalence of complicated layout structures, ob- taining a structured representation of historical document scans is often extremely complicated. In this example, LayoutParser was used to develop a comprehensive pipeline, shown in Figure 5, to gener- ate high-quality structured data from historical Japanese firm financial ta- bles with complicated layouts. The pipeline applies two layout models to identify different levels of document structures and two customized OCR engines for optimized character recog- nition accuracy." }, { - "type": "FigureCaption", - "element_id": "b33b2bc3b9c416673c7f74c6a00c49d8", + "type": "Image", + "element_id": "d32d5d93079c0053b7ef655185e47bb4", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "(spe peepee, ‘Active Learning Layout Annotate Layout Dataset | + ‘Annotation Toolkit ¥ a Deep Leaming Layout Model Training & Inference, ¥ ; Handy Data Structures & Post-processing El Apis for Layout Det a LAR ror tye eats) 4 Text Recognition | <—— Default ane Customized ¥ ee Layout Structure Visualization & Export | <—— | visualization & Storage The Japanese Document Helpful LayoutParser Digitization Pipeline Modules" + "text": "Annotate Layout Dataset(spe peepee,Active Learning LayoutAnnotation Toolkit4Layout Detection<—Deep Learning LayoutModel Training & Inference,4Post-processin Handy Data Structures &pl 9 APIs for Layout DataText Recognition Default and Customized: r OCR ModelsVisualization & Export |], bayou StructureVisualization & StorageThe Japanese DocumentDigitization PipelineHelpful LayoutParserModules" }, { "type": "NarrativeText", @@ -1323,23 +1243,13 @@ }, { "type": "NarrativeText", - "element_id": "4005fd5e1a8a65c8e989071255cd7386", + "element_id": "de8f09a4156ca73defac521bb354a297", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 11 }, - "text": "15 A document page consists of eight rows like this. For simplicity we skip the row" - }, - { - "type": "Title", - "element_id": "5d0786de7b188a10caffb32c951327a2", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 11 - }, - "text": "segmentation discussion and refer readers to the source code when available." + "text": "& document page consists of eight rows like this. For simplicity we skip the row segmentation discussion and refer readers to the source code when available." }, { "type": "UncategorizedText", @@ -1412,44 +1322,14 @@ "text": "Overall, it is possible to create an intricate and highly accurate digitization pipeline for large-scale digitization using LayoutParser. The pipeline avoids specifying the complicated rules used in traditional methods, is straightforward to develop, and is robust to outliers. The DL models also generate fine-grained results that enable creative approaches like page reorganization for OCR." }, { - "type": "NarrativeText", - "element_id": "d11adbfd88959ce24fbfdc7f8155e777", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 12 - }, - "text": "16 This measures the overlap between the detected and ground-truth characters, and" - }, - { - "type": "NarrativeText", - "element_id": "5b6b4f6a5766bdb4f09f0a0387a3a373", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 12 - }, - "text": "the maximum is 1." - }, - { - "type": "NarrativeText", - "element_id": "48033291e6d72fefde1a56827e6dacfb", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 12 - }, - "text": "17 This measures the number of edits from the ground-truth text to the predicted text," - }, - { - "type": "NarrativeText", - "element_id": "5737ba23368c5333b0c39f7e8e474d03", + "type": "ListItem", + "element_id": "122f0a4bde97c6e10e95c6e54479e34e", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 12 }, - "text": "and lower is better." + "text": "16 This measures the overlap between the detected and ground-truth characters, and the maximum is 1. '7 This measures the number of edits from the ground-truth text to the predicted text, and lower is better." }, { "type": "Title", @@ -1472,14 +1352,14 @@ "text": "13" }, { - "type": "FigureCaption", - "element_id": "7d42bb6af1404a95a6e8870d5c4d07bf", + "type": "Image", + "element_id": "f58d47bde7ebddd81c4a678c918a8f1b", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 13 }, - "text": "(@) Partial table at the bottom (&) Full page table (6) Partial table at the top (d) Mis-detected tet line" + "text": "(2) Partial table atthe bottom (&) Full page table (6) Partial table at the top (d) Mis-detected tet line" }, { "type": "NarrativeText", @@ -1592,84 +1472,14 @@ "text": "References" }, { - "type": "UncategorizedText", - "element_id": "b5bf13691648f2be7e686436513a7366", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 14 - }, - "text": "[1] Abadi, M., Agarwal, A., Barham, P., Brevdo, E., Chen, Z., Citro, C., Corrado, G.S., Davis, A., Dean, J., Devin, M., Ghemawat, S., Goodfellow, I., Harp, A., Irving, G., Isard, M., Jia, Y., Jozefowicz, R., Kaiser, L., Kudlur, M., Levenberg, J., Man´e, D., Monga, R., Moore, S., Murray, D., Olah, C., Schuster, M., Shlens, J., Steiner, B., Sutskever, I., Talwar, K., Tucker, P., Vanhoucke, V., Vasudevan, V., Vi´egas, F., Vinyals, O., Warden, P., Wattenberg, M., Wicke, M., Yu, Y., Zheng, X.: TensorFlow: Large-scale machine learning on heterogeneous systems (2015), https://www.tensorflow.org/, software available from tensorflow.org" - }, - { - "type": "NarrativeText", - "element_id": "098ca0ae774b51e7eba5dbe98641da88", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 14 - }, - "text": "[2] Alberti, M., Pondenkandath, V., W¨ursch, M., Ingold, R., Liwicki, M.: Deepdiva: a highly-functional python framework for reproducible experiments. In: 2018 16th International Conference on Frontiers in Handwriting Recognition (ICFHR). pp. 423–428. IEEE (2018)" - }, - { - "type": "NarrativeText", - "element_id": "0054c11c9691968349806c35f6aa5f0f", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 14 - }, - "text": "[3] Antonacopoulos, A., Bridson, D., Papadopoulos, C., Pletschacher, S.: A realistic dataset for performance evaluation of document layout analysis. In: 2009 10th International Conference on Document Analysis and Recognition. pp. 296–300. IEEE (2009)" - }, - { - "type": "NarrativeText", - "element_id": "607a64b13da109e96c62ecaedce91c4f", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 14 - }, - "text": "[4] Baek, Y., Lee, B., Han, D., Yun, S., Lee, H.: Character region awareness for text detection. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. pp. 9365–9374 (2019)" - }, - { - "type": "UncategorizedText", - "element_id": "9409d20f2ee25336c2566bda8d8bb83c", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 14 - }, - "text": "[5] Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: A Large-Scale" - }, - { - "type": "NarrativeText", - "element_id": "44c5093519506610b07942b24d966d77", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 14 - }, - "text": "Hierarchical Image Database. In: CVPR09 (2009)" - }, - { - "type": "NarrativeText", - "element_id": "ad1bf75fc53d123c878f8254f9304c9f", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 14 - }, - "text": "[6] Deng, Y., Kanervisto, A., Ling, J., Rush, A.M.: Image-to-markup generation with coarse-to-fine attention. In: International Conference on Machine Learning. pp. 980–989. PMLR (2017)" - }, - { - "type": "NarrativeText", - "element_id": "c6e835fe03323406543926cc0f5a94de", + "type": "ListItem", + "element_id": "af2a971baba0e022d1e53fc0e44b1d94", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 14 }, - "text": "[7] Ganin, Y., Lempitsky, V.: Unsupervised domain adaptation by backpropagation. In: International conference on machine learning. pp. 1180–1189. PMLR (2015)" + "text": "[1] Abadi, M., Agarwal, A., Barham, P., Brevdo, E., Chen, Z., Citro, C., Corrado, ot G.S., Davis, A., Dean, J., Devin, M., Ghemawat, S., Goodfellow, I., Harp, A., Irving, G., Isard, M., Jia, Y., Jozefowicz, R., Kaiser, L., Kudlur, M., Levenberg, J., Mané, D., Monga, R., Moore, S., Murray, D., Olah, C., Schuster, M., Shlens, J., Steiner, B., Sutskever, I., Talwar, K., Tucker, P., Vanhoucke, V., Vasudevan, V., Viégas, F., Vinyals, O., Warden, P., Wattenberg, M., Wicke, M., Yu, Y., Zheng, X.: TensorFlow: Large-scale machine learning on heterogeneous systems (2015), software available from tensorflow.org Alberti, M., Pondenkandath, V., Wiirsch, M., Ingold, R., Liwicki, M.: Deepdiva: a highly-functional python framework for reproducible experiments. In: 2018 16th International Conference on Frontiers in Handwriting Recognition (ICFHR). pp. 423-428. IEEE (2018) Antonacopoulos, A., Bridson, D., Papadopoulos, C., Pletschacher, S.: A realistic dataset for performance evaluation of document layout analysis. In: 2009 10th International Conference on Document Analysis and Recognition. pp. 296-300. IEEE (2009) Baek, Y., Lee, B., Han, D., Yun, S., Lee, H.: Character region awareness for text detection. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. pp. 9365-9374 (2019) Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: A Large-Scale Hierarchical Image Database. In: CVPRO9 (2009) Deng, Y., Kanervisto, A., Ling, J., Rush, A.M.: Image-to-markup generation with coarse-to-fine attention. In: International Conference on Machine Learning. pp. 980-989. PMLR (2017) Ganin, Y., Lempitsky, V.: Unsupervised domain adaptation by backpropagation. In: International conference on machine learning. pp. 1180-1189. PMLR (2015)" }, { "type": "Title", @@ -1692,164 +1502,14 @@ "text": "15" }, { - "type": "UncategorizedText", - "element_id": "16390873ae6b6a173fc894a873bab022", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 15 - }, - "text": "[9]" - }, - { - "type": "NarrativeText", - "element_id": "068bf90a7743f50c4a00d4827035e42f", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 15 - }, - "text": "[11] Harley, A.W., Ufkes, A., Derpanis, K.G.: Evaluation of deep convolutional nets for document image classification and retrieval. In: 2015 13th International Conference on Document Analysis and Recognition (ICDAR). pp. 991–995. IEEE (2015) [12] He, K., Gkioxari, G., Doll´ar, P., Girshick, R.: Mask r-cnn. In: Proceedings of the" - }, - { - "type": "NarrativeText", - "element_id": "813cac1316043d454f3c928740435736", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 15 - }, - "text": "[10] Graves, A., Fern´andez, S., Gomez, F., Schmidhuber, J.: Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks. In: Proceedings of the 23rd international conference on Machine learning. pp. 369–376 (2006)" - }, - { - "type": "NarrativeText", - "element_id": "2f103adde52e35a8853cbb476720a6ef", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 15 - }, - "text": "[8] Gardner, M., Grus, J., Neumann, M., Tafjord, O., Dasigi, P., Liu, N., Peters, M., Schmitz, M., Zettlemoyer, L.: Allennlp: A deep semantic natural language processing platform. arXiv preprint arXiv:1803.07640 (2018) (cid:32)Lukasz Garncarek, Powalski, R., Stanis(cid:32)lawek, T., Topolski, B., Halama, P., Grali´nski, F.: Lambert: Layout-aware (language) modeling using bert for in- formation extraction (2020)" - }, - { - "type": "Title", - "element_id": "4d54eb351d8fc3bfbbf7286aa15eabe3", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 15 - }, - "text": "IEEE international conference on computer vision. pp. 2961–2969 (2017)" - }, - { - "type": "NarrativeText", - "element_id": "124b6b55da69fccc1c06568bda34f63c", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 15 - }, - "text": "[13] He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 770–778 (2016)" - }, - { - "type": "Title", - "element_id": "9b9688203e9cdea89ded788342be4032", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 15 - }, - "text": "[14] Kay, A.: Tesseract: An open-source optical character recognition engine. Linux J." - }, - { - "type": "UncategorizedText", - "element_id": "e90f44c0e10f9acb4d8f4c5895846d1e", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 15 - }, - "text": "2007(159), 2 (Jul 2007)" - }, - { - "type": "NarrativeText", - "element_id": "3e0b97d540b7b43ad61292a89a58137f", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 15 - }, - "text": "[15] Lamiroy, B., Lopresti, D.: An open architecture for end-to-end document analysis benchmarking. In: 2011 International Conference on Document Analysis and Recognition. pp. 42–47. IEEE (2011)" - }, - { - "type": "NarrativeText", - "element_id": "80498c312fd32cb744e5953dfef18604", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 15 - }, - "text": "[16] Lee, B.C., Weld, D.S.: Newspaper navigator: Open faceted search for 1.5 million images. In: Adjunct Publication of the 33rd Annual ACM Sym- posium on User Interface Software and Technology. p. 120–122. UIST ’20 Adjunct, Association for Computing Machinery, New York, NY, USA (2020). https://doi.org/10.1145/3379350.3416143, https://doi-org.offcampus. lib.washington.edu/10.1145/3379350.3416143" - }, - { - "type": "NarrativeText", - "element_id": "09cfad31b28b1315b0bc7bd219136057", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 15 - }, - "text": "[17] Lee, B.C.G., Mears, J., Jakeway, E., Ferriter, M., Adams, C., Yarasavage, N., Thomas, D., Zwaard, K., Weld, D.S.: The Newspaper Navigator Dataset: Extracting Headlines and Visual Content from 16 Million Historic Newspaper Pages in Chronicling America, p. 3055–3062. Association for Computing Machinery, New York, NY, USA (2020), https://doi.org/10.1145/3340531.3412767" - }, - { - "type": "NarrativeText", - "element_id": "be647bda3f1ca1b63554ef22d1313a43", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 15 - }, - "text": "[18] Li, M., Cui, L., Huang, S., Wei, F., Zhou, M., Li, Z.: Tablebank: Table benchmark for image-based table detection and recognition. arXiv preprint arXiv:1903.01949 (2019)" - }, - { - "type": "NarrativeText", - "element_id": "890eb2d0b6b7dbf00a5e0a4ad2f82107", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 15 - }, - "text": "[19] Lin, T.Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll´ar, P., Zitnick, C.L.: Microsoft coco: Common objects in context. In: European conference on computer vision. pp. 740–755. Springer (2014)" - }, - { - "type": "NarrativeText", - "element_id": "62b12089ccbd0d2dd2f6c292cfa6a6fb", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 15 - }, - "text": "[20] Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 3431–3440 (2015)" - }, - { - "type": "NarrativeText", - "element_id": "f7cfa7ca2e7175d8bdba9c0cb26a7c98", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 15 - }, - "text": "[21] Neudecker, C., Schlarb, S., Dogan, Z.M., Missier, P., Sufi, S., Williams, A., Wolsten- croft, K.: An experimental workflow development platform for historical document digitisation and analysis. In: Proceedings of the 2011 workshop on historical document imaging and processing. pp. 161–168 (2011)" - }, - { - "type": "NarrativeText", - "element_id": "aae12b8f70e03a3e35015ebda5974ebe", + "type": "ListItem", + "element_id": "ab02ce354f7464ee1d53d58faa93745f", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 15 }, - "text": "[22] Oliveira, S.A., Seguin, B., Kaplan, F.: dhsegment: A generic deep-learning approach for document segmentation. In: 2018 16th International Conference on Frontiers in Handwriting Recognition (ICFHR). pp. 7–12. IEEE (2018)" + "text": "17 18 19 20 Gardner, M., Grus, J., Neumann, M., Tafjord, O., Dasigi, P., Liu, N., Peters, M., Schmitz, M., Zettlemoyer, L.: Allennlp: A deep semantic natural language processing platform. arXiv preprint arXiv:1803.07640 (2018) Lukasz Garncarek, Powalski, R., Stanistawek, T., Topolski, B., Halama, P., Graliriski, F.: Lambert: Layout-aware (language) modeling using bert for in- formation extraction (2020) Graves, A., Fernandez, $., Gomez, F., Schmidhuber, J.: Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks. In: Proceedings of the 23rd international conference on Machine learning. pp. 369-376 (2006) Harley, A.W., Ufkes, A., Derpanis, K.G.: Evaluation of deep convolutional nets for document image classification and retrieval. In: 2015 13th International Conference on Document Analysis and Recognition (ICDAR). pp. 991-995. IEEE (2015) He, K., Gkioxari, G., Dollar, P., Girshick, R.: Mask r-cnn. In: Proceedings of the IEEE international conference on computer vision. pp. 2961-2969 (2017) He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 770-778 (2016) Kay, A.: Tesseract: An open-source optical character recognition engine. Linux J. 2007(159), 2 (Jul 2007) Lamiroy, B., Lopresti, D.: An open architecture for end-to-end document analysis benchmarking. In: 2011 International Conference on Document Analysis and Recognition. pp. 42-47. IEEE (2011) Lee, B.C., Weld, D.S.: Newspaper navigator: Open faceted search for 1.5 million images. In: Adjunct Publication of the 33rd Annual ACM Sym- posium on User Interface Software and Technology. p. 120-122. UIST 20 Adjunct, Association for Computing Machinery, New York, NY, USA (2020). https: //doi.org/10.1145/3379350.3416143 Lee, B.C.G., Mears, J., Jakeway, E., Ferriter, M., Adams, C., Yarasavage, N., Thomas, D., Zwaard, K., Weld, D.S.: The Newspaper Navigator Dataset: Extracting Headlines and Visual Content from 16 Million Historic Newspaper Pages in Chronicling America, p. 3055-3062. Association for Computing Machinery, New York, NY, USA (2020), Li, M., Cui, L., Huang, S., Wei, F., Zhou, M., Li, Z.: Tablebank: Table benchmark for image-based table detection and recognition. arXiv preprint arXiv:1903.01949 (2019) Lin, T.Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Dollar, P., Zitnick, C.L.: Microsoft coco: Common objects in context. In: European conference on computer vision. pp. 740-755. Springer (2014) Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 3431-3440 (2015) Neudecker, C., Schlarb, S., Dogan, Z.M., Missier, P., Sufi, $., Williams, A., Wolsten- croft, K.: An experimental workflow development platform for historical document digitisation and analysis. In: Proceedings of the 2011 workshop on historical document imaging and processing. pp. 161-168 (2011) Oliveira, S.A., Seguin, B., Kaplan, F.: dhsegment: A generic deep-learning approach for document segmentation. In: 2018 16th International Conference on Frontiers in Handwriting Recognition (ICFHR). pp. 7-12. IEEE (2018)" }, { "type": "UncategorizedText", @@ -1872,183 +1532,13 @@ "text": "Z. Shen et al." }, { - "type": "NarrativeText", - "element_id": "1abcfa28cce9b0f5194dec0d534f28e5", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 16 - }, - "text": "[27] Qasim, S.R., Mahmood, H., Shafait, F.: Rethinking table recognition using graph neural networks. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 142–147. IEEE (2019)" - }, - { - "type": "NarrativeText", - "element_id": "f7c67eae65521c3a753337d08c5a7cc3", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 16 - }, - "text": "[28] Ren, S., He, K., Girshick, R., Sun, J.: Faster r-cnn: Towards real-time object detection with region proposal networks. In: Advances in neural information processing systems. pp. 91–99 (2015)" - }, - { - "type": "NarrativeText", - "element_id": "4f43b2e563a35ae0208a8626f7e3280e", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 16 - }, - "text": "[31] Shen, Z., Zhang, K., Dell, M.: A large dataset of historical japanese documents with complex layouts. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops. pp. 548–549 (2020)" - }, - { - "type": "UncategorizedText", - "element_id": "b66713d3f2d1689f9174e1cb87429eed", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 16 - }, - "text": "[32] Shen, Z., Zhao, J., Dell, M., Yu, Y., Li, W.: Olala: Object-level active learning" - }, - { - "type": "UncategorizedText", - "element_id": "10a3ff59f6157f21733e659a41031f83", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 16 - }, - "text": "[37] Xu, Y., Li, M., Cui, L., Huang, S., Wei, F., Zhou, M.: Layoutlm: Pre-training of" - }, - { - "type": "NarrativeText", - "element_id": "219033258f3fff3de33bed379610c8f3", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 16 - }, - "text": "[23] Paszke, A., Gross, S., Chintala, S., Chanan, G., Yang, E., DeVito, Z., Lin, Z., Desmaison, A., Antiga, L., Lerer, A.: Automatic differentiation in pytorch (2017) [24] Paszke, A., Gross, S., Massa, F., Lerer, A., Bradbury, J., Chanan, G., Killeen, T., Lin, Z., Gimelshein, N., Antiga, L., et al.: Pytorch: An imperative style, high-performance deep learning library. arXiv preprint arXiv:1912.01703 (2019) [25] Pletschacher, S., Antonacopoulos, A.: The page (page analysis and ground-truth elements) format framework. In: 2010 20th International Conference on Pattern Recognition. pp. 257–260. IEEE (2010)" - }, - { - "type": "NarrativeText", - "element_id": "285ce5849d6fd9036e5d16724c024ab9", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 16 - }, - "text": "[26] Prasad, D., Gadpal, A., Kapadni, K., Visave, M., Sultanpure, K.: Cascadetabnet: An approach for end to end table detection and structure recognition from image- based documents. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops. pp. 572–573 (2020)" - }, - { - "type": "NarrativeText", - "element_id": "a18dcb504d62cb9f8ed4641014b6eeb2", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 16 - }, - "text": "[29] Scarselli, F., Gori, M., Tsoi, A.C., Hagenbuchner, M., Monfardini, G.: The graph neural network model. IEEE transactions on neural networks 20(1), 61–80 (2008) [30] Schreiber, S., Agne, S., Wolf, I., Dengel, A., Ahmed, S.: Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In: 2017 14th IAPR international conference on document analysis and recognition (ICDAR). vol. 1, pp. 1162–1167. IEEE (2017)" - }, - { - "type": "NarrativeText", - "element_id": "da6733a53c75743361e9edcc1d36a20c", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 16 - }, - "text": "[33] Studer, L., Alberti, M., Pondenkandath, V., Goktepe, P., Kolonko, T., Fischer, A., Liwicki, M., Ingold, R.: A comprehensive study of imagenet pre-training for historical document image analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 720–725. IEEE (2019)" - }, - { - "type": "NarrativeText", - "element_id": "385c241b43ef196663b8d30a6b8768ed", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 16 - }, - "text": "[34] Wolf, T., Debut, L., Sanh, V., Chaumond, J., Delangue, C., Moi, A., Cistac, P., Rault, T., Louf, R., Funtowicz, M., et al.: Huggingface’s transformers: State-of- the-art natural language processing. arXiv preprint arXiv:1910.03771 (2019) [35] Wu, Y., Kirillov, A., Massa, F., Lo, W.Y., Girshick, R.: Detectron2. https://" - }, - { - "type": "NarrativeText", - "element_id": "d207e2724a17741e3ae1986d63cb5636", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 16 - }, - "text": "[36] Xu, Y., Xu, Y., Lv, T., Cui, L., Wei, F., Wang, G., Lu, Y., Florencio, D., Zhang, C., Che, W., et al.: Layoutlmv2: Multi-modal pre-training for visually-rich document understanding. arXiv preprint arXiv:2012.14740 (2020)" - }, - { - "type": "Title", - "element_id": "93d261a89a8422fb8d166e6cdf95d8f6", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 16 - }, - "text": "github.com/facebookresearch/detectron2 (2019)" - }, - { - "type": "NarrativeText", - "element_id": "9dce913bddaa63724f5de64e539b7016", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 16 - }, - "text": "based layout annotation. arXiv preprint arXiv:2010.01762 (2020)" - }, - { - "type": "Title", - "element_id": "2625b6830768eac986cfee208c0270de", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 16 - }, - "text": "text and layout for document image understanding (2019)" - }, - { - "type": "Title", - "element_id": "21d399ba787aabbf69a8ca861cbcc4a3", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 16 - }, - "text": "[38] Zhong, X., Tang, J., Yepes, A.J.: Publaynet:" - }, - { - "type": "Title", - "element_id": "462753569cb801c6f858759742a93793", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 16 - }, - "text": "ument Analysis and Recognition (ICDAR). pp. 1015–1022. https://doi.org/10.1109/ICDAR.2019.00166" - }, - { - "type": "Title", - "element_id": "c7fc0ade487926854bb602bca85fad60", - "metadata": { - "data_source": {}, - "filetype": "application/pdf", - "page_number": 16 - }, - "text": "layout analysis." - }, - { - "type": "UncategorizedText", - "element_id": "96c49c3fbbb585f8062778e9a404b00f", + "type": "ListItem", + "element_id": "993f472d953f5d0e4054f1d4ad6fc4f0", "metadata": { "data_source": {}, "filetype": "application/pdf", "page_number": 16 }, - "text": "largest dataset ever for doc- In: 2019 International Conference on Document IEEE (Sep 2019)." + "text": "23 github. com/facebookresearch/detectron2) (2019) Paszke, A., Gross, S., Chintala, S., Chanan, G., Yang, E., DeVito, Z., Lin, Z., Desmaison, A., Antiga, L., Lerer, A.: Automatic differentiation in pytorch (2017) Paszke, A., Gross, S., Massa, F., Lerer, A., Bradbury, J., Chanan, G., Killeen, T., Lin, Z., Gimelshein, N., Antiga, L., et al.: Pytorch: An imperative style, high-performance deep learning library. arXiv preprint arXiv:1912.01703 (2019) Pletschacher, S., Antonacopoulos, A.: The page (page analysis and ground-truth elements) format framework. In: 2010 20th International Conference on Pattern Recognition. pp. 257-260. IEEE (2010) Prasad, D., Gadpal, A., Kapadni, K., Visave, M., Sultanpure, K.: Cascadetabnet: An approach for end to end table detection and structure recognition from image- based documents. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops. pp. 572-573 (2020) Qasim, S.R., Mahmood, H., Shafait, F.: Rethinking table recognition using graph neural networks. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 142-147. IEEE (2019) Ren, S., He, K., Girshick, R., Sun, J.: Faster r-cnn: Towards real-time object detection with region proposal networks. In: Advances in neural information processing systems. pp. 91-99 (2015) Scarselli, F., Gori, M., Tsoi, A.C., Hagenbuchner, M., Monfardini, G.: The graph neural network model. IEEE transactions on neural networks 20(1), 61-80 (2008) Schreiber, S., Agne, S., Wolf, I., Dengel, A., Ahmed, S.: Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In: 2017 14th IAPR international conference on document analysis and recognition (ICDAR). vol. 1, pp. 1162-1167. IEEE (2017) Shen, Z., Zhang, K., Dell, M.: A large dataset of historical japanese documents with complex layouts. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops. pp. 548-549 (2020) Shen, Z., Zhao, J., Dell, M., Yu, Y., Li, W.: Olala: Object-level active learning based layout annotation. arXiv preprint arXiv:2010.01762 (2020) Studer, L., Alberti, M., Pondenkandath, V., Goktepe, P., Kolonko, T., Fischer, A., Liwicki, M., Ingold, R.: A comprehensive study of imagenet pre-training for historical document image analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 720-725. IEEE (2019) Wolf, T., Debut, L., Sanh, V., Chaumond, J., Delangue, C., Moi, A., Cistac, P., Rault, T., Louf, R., Funtowicz, M., et al.: Huggingface’s transformers: State-of- the-art natural language processing. arXiv preprint arXiv:1910.03771 (2019) Wu, Y., Kirillov, A., Massa, F., Lo, W.Y., Girshick, R.: Detectron2. Xu, Y., Xu, Y., Lv, T., Cui, L., Wei, F., Wang, G., Lu, Y., Florencio, D., Zhang, C., Che, W., et al.: Layoutlmv2: Multi-modal pre-training for visually-rich document understanding. arXiv preprint arXiv:2012.14740 (2020) Xu, Y., Li, M., Cui, L., Huang, S., Wei, F., Zhou, M.: Layoutlm: Pre-training of text and layout for document image understanding (2019) Zhong, X., Tang, J., Yepes, A.J.: Publaynet: largest dataset ever for doc- ument layout analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1015-1022. IEEE (Sep 2019). https: //doi.org/10.1109/ICDAR.2019.00166" } ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json index 4baf9be5a6..a8cc14e267 100644 --- a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json @@ -91,7 +91,7 @@ }, { "type": "ListItem", - "element_id": "f1d5f4ed63a14db581e985bf15416cdd", + "element_id": "4f0cdff19ccd9010b64eff87ced8e0b7", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -105,79 +105,7 @@ "filetype": "application/pdf", "page_number": 2 }, - "text": "Global growth is projected to fall from an estimated 3.4 percent in 2022 to 2.9 percent in 2023, then rise to 3.1 percent in 2024. The forecast for 2023 is 0.2 percentage point higher than predicted in the October 2022 World Economic Outlook (WEO) but below the historical (2000–19) average of 3.8 percent. The rise in central bank rates to fight inflation and Russia’s war in Ukraine continue to weigh on economic activity. The rapid spread of COVID-19 in China dampened growth in 2022, but the recent reopening has paved the way for a faster-than-expected recovery. Global inflation is expected to fall from 8.8 percent in 2022 to 6.6 percent in 2023 and 4.3 percent in 2024, still above pre-pandemic (2017–19) levels of about 3.5 percent." - }, - { - "type": "ListItem", - "element_id": "c4e0168ffab999611a92e8ebd8fe48a9", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "The balance of risks remains tilted to the downside, but adverse risks have moderated since the October 2022" - }, - { - "type": "NarrativeText", - "element_id": "74180a93b38b6808f8cff7439e5d16d2", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "WEO. On the upside, a stronger boost from pent-up demand in numerous economies or a faster fall in inflation are plausible. On the downside, severe health outcomes in China could hold back the recovery, Russia’s war in Ukraine could escalate, and tighter global financing conditions could worsen debt distress. Financial markets could also suddenly reprice in response to adverse inflation news, while further geopolitical fragmentation could hamper economic progress." - }, - { - "type": "ListItem", - "element_id": "5e9b501fc056965a744f6598d022f31d", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "In most economies, amid the cost-of-living crisis, the priority remains achieving sustained disinflation. With" - }, - { - "type": "NarrativeText", - "element_id": "9f5a3fe548f011e304fda9067caa0824", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 2 - }, - "text": "tighter monetary conditions and lower growth potentially affecting financial and debt stability, it is necessary to deploy macroprudential tools and strengthen debt restructuring frameworks. Accelerating COVID-19 vaccinations in China would safeguard the recovery, with positive cross-border spillovers. Fiscal support should be better targeted at those most affected by elevated food and energy prices, and broad-based fiscal relief measures should be withdrawn. Stronger multilateral cooperation is essential to preserve the gains from the rules-based multilateral system and to mitigate climate change by limiting emissions and raising green investment." + "text": "© Global growth is projected to fall from an estimated 3.4 percent in 2022 to 2.9 percent in 2023, then rise to 3.1 percent in 2024. The forecast for 2023 is 0.2 percentage point higher than predicted in the October 2022 World Economic Outlook (WEO) but below the historical (2000-19) average of 3.8 percent. The rise in central bank rates to fight inflation and Russia’s war in Ukraine continue to weigh on economic activity. The rapid spread of COVID-19 in China dampened growth in 2022, but the recent reopening has paved the way Jor a faster-than-expected recovery. Global inflation is expected to fall from 8.8 percent in 2022 to 6.6 percent in 2023 and 4.3 percent in 2024, still above pre-pandemic (2017-19) levels of about 3.5 percent. © = The balance of risks remains tilted to the downside, but adverse risks have moderated since the October 2022 WEO. On the upside, a stronger boost from pent-up demand in numerous economies or a faster fall in inflation are plausible. On the downside, severe health outcomes in China could hold back the recovery, Russia’s war in Ukraine could escalate, and tighter global financing conditions could worsen debt distress. Financial markets could also suddenly reprice in response to adverse inflation news, while further geopolitical fragmentation could hamper economic progress. © In most economies, amid the cost-of-living crisis, the priority remains achieving sustained disinflation. With tighter monetary conditions and lower growth potentially affecting financial and debt stability, it is necessary to deploy macroprudential tools and strengthen debt restructuring frameworks. Accelerating COVID-19 vaccinations in China would safeguard the recovery, with positive cross-border spillovers. Fiscal support should be better targeted at those most affected by elevated food and energy prices, and broad-based fiscal relief measures should be withdrawn. Stronger multilateral cooperation is essential to preserve the gains from the rules-based multilateral system and to mitigate climate change by limiting emissions and raising green investment." }, { "type": "Title", @@ -1097,117 +1025,9 @@ }, "text": "WORLD ECONOMIC OUTLOOK UPDATE, JANUARY 2023" }, - { - "type": "NarrativeText", - "element_id": "70f05b9620aa1b7236058898e7e59192", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 5 - }, - "text": "economies. There is a 0.4 percentage point upward revision for annual growth in 2023, reflecting carryover effects from domestic demand resilience in 2022, but a 0.2 percentage point downward revision of growth in 2024 due to the steeper path of Federal Reserve rate hikes, to a peak of about 5.1 percent in 2023." - }, - { - "type": "ListItem", - "element_id": "fd6c549473e196512c076844988f465c", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 5 - }, - "text": "Growth in the euro area is projected to bottom out at 0.7 percent in 2023 before rising to 1.6" - }, - { - "type": "NarrativeText", - "element_id": "cdcaed7d1296edd658256d603cb3828c", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 5 - }, - "text": "percent in 2024. The 0.2 percentage point upward revision to the forecast for 2023 reflects the effects of faster rate hikes by the European Central Bank and eroding real incomes, offset by the carryover from the 2022 outturn, lower wholesale energy prices, and additional announcements of fiscal purchasing power support in the form of energy price controls and cash transfers." - }, - { - "type": "ListItem", - "element_id": "3be6554964c172468cceaee89294f59d", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 5 - }, - "text": "Growth in the United Kingdom is projected to be –0.6 percent in 2023, a 0.9 percentage point" - }, - { - "type": "NarrativeText", - "element_id": "7e32067b6a4662d72b1244a3aac91be5", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 5 - }, - "text": "downward revision from October, reflecting tighter fiscal and monetary policies and financial conditions and still-high energy retail prices weighing on household budgets." - }, { "type": "ListItem", - "element_id": "b24771387a5318eeda21adaa49629186", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 5 - }, - "text": "Growth in Japan is projected to rise to 1.8 percent in 2023, with continued monetary and fiscal" - }, - { - "type": "NarrativeText", - "element_id": "f8b94e8d9a593a1debae96fce2040db7", + "element_id": "becf96ae2fa1045c14996c3de7a05bb8", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1221,7 +1041,7 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "policy support. High corporate profits from a depreciated yen and earlier delays in implementing previous projects will support business investment. In 2024, growth is expected to decline to 0.9 percent as the effects of past stimulus dissipate." + "text": "economies. There is a 0.4 percentage point upward revision for annual growth in 2023, reflecting carryover effects from domestic demand resilience in 2022, but a 0.2 percentage point downward revision of growth in 2024 due to the steeper path of Federal Reserve rate hikes, to a peak of about 5.1 percent in 2023. Growth in the ero area is projected to bottom out at 0.7 percent in 2023 before rising to 1.6 percent in 2024. The 0.2 percentage point upward revision to the forecast for 2023 reflects the effects of faster rate hikes by the European Central Bank and eroding real incomes, offset by the carryover from the 2022 outturn, lower wholesale energy prices, and additional announcements of fiscal purchasing power support in the form of energy price controls and cash transfers. Growth in the United Kingdom is projected to be —0.6 percent in 2023, a 0.9 percentage point downward revision from October, reflecting tighter fiscal and monetary policies and financial conditions and still-high energy retail prices weighing on household budgets. Growth in Japan is projected to rise to 1.8 percent in 2023, with continued monetary and fiscal policy support. High corporate profits from a depreciated yen and earlier delays in implementing previous projects will support business investment. In 2024, growth is expected to decline to 0.9 percent as the effects of past stimulus dissipate." }, { "type": "NarrativeText", @@ -1259,45 +1079,9 @@ }, "text": "Growth in emerging and developing Asia is expected to rise in 2023 and 2024 to 5.3 percent and 5.2" }, - { - "type": "NarrativeText", - "element_id": "237bc02ecaaf27f074be0c466b31cc09", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 5 - }, - "text": "percent, respectively, after the deeper-than-expected slowdown in 2022 to 4.3 percent attributable to China’s economy. China’s real GDP slowdown in the fourth quarter of 2022 implies a 0.2 percentage point downgrade for 2022 growth to 3.0 percent—the first time in more than 40 years with China’s growth below the global average. Growth in China is projected to rise to 5.2 percent in 2023, reflecting rapidly improving mobility, and to fall to 4.5 percent in 2024 before settling at below 4 percent over the medium term amid declining business dynamism and slow progress on structural reforms. Growth in India is set to decline from 6.8 percent in 2022 to 6.1 percent in 2023 before picking up to 6.8 percent in 2024, with resilient domestic demand despite external headwinds. Growth in the ASEAN-5 countries (Indonesia, Malaysia, Philippines, Singapore, Thailand) is similarly projected to slow to 4.3 percent in 2023 and then pick up to 4.7 percent in 2024." - }, - { - "type": "ListItem", - "element_id": "e3b0c44298fc1c149afbf4c8996fb924", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 5 - }, - "text": "" - }, { "type": "ListItem", - "element_id": "afde979c99a73646915fe253c85c5a9c", + "element_id": "bba948699d4f21aaf5001520bb796e17", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1311,7 +1095,7 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "Growth in emerging and developing Europe is projected to have bottomed out in 2022 at 0.7 percent and, since the October forecast, has been revised up for 2023 by 0.9 percentage point to 1.5 percent. This reflects a smaller economic contraction in Russia in 2022 (estimated at –2.2 percent compared with a predicted –3.4 percent) followed by modestly positive growth in 2023. At the current oil price cap level of the Group of Seven, Russian crude oil export volumes are not expected to be significantly affected, with Russian trade continuing to be redirected from sanctioning to non-sanctioning countries. In Latin America and the Caribbean, growth is projected to decline from 3.9 percent in 2022 to 1.8 percent in 2023, with an upward revision for 2023 of 0.1 percentage point since October. The forecast revision reflects upgrades of 0.2 percentage point for Brazil and 0.5 percentage point for Mexico due to unexpected domestic demand resilience, higher-than-expected growth in" + "text": "Growth in emerging and developing Asia is expected to rise in 2023 and 2024 to 5.3 percent and 5.2 percent, respectively, after the deeper-than-expected slowdown in 2022 to 4.3 percent attributable to China’s economy. China’s real GDP slowdown in the fourth quarter of 2022 implies a 0.2 percentage point downgrade for 2022 growth to 3.0 percent—the first time in more than 40 years with China’s growth below the global average. Growth in China is projected to tise to 5.2 percent in 2023, reflecting rapidly improving mobility, and to fall to 4.5 percent in 2024 before settling at below 4 percent over the medium term amid declining business dynamism and slow progress on structural reforms. Growth in India is set to decline from 6.8 percent in 2022 to 6.1 percent in 2023 before picking up to 6.8 percent in 2024, with resilient domestic demand despite external headwinds. Growth in the ASEAN-5 countries (Indonesia, Malaysia, Philippines, Singapore, Thailand) is similarly projected to slow to 4.3 percent in 2023 and then pick up to 4.7 percent in 2024. Growth in emerging and developing Europe is projected to have bottomed out in 2022 at 0.7 percent and, since the October forecast, has been revised up for 2023 by 0.9 percentage point to 1.5 percent. This reflects a smaller economic contraction in Rwssia in 2022 (estimated at —2.2 percent compared with a predicted —3.4 percent) followed by modestly positive growth in 2023. At the current oil price cap level of the Group of Seven, Russian crude oil export volumes are not expected to be significantly affected, with Russian trade continuing to be redirected from sanctioning to non-sanctioning countries. In Latin America and the Caribbean, growth is projected to decline from 3.9 percent in 2022 to 1.8 percent in 2023, with an upward revision for 2023 of 0.1 percentage point since October. The forecast revision reflects upgtades of 0.2 percentage point for Brazi/ and 0.5 percentage point for Mexico due to unexpected domestic demand resilience, higher-than-expected growth in" }, { "type": "UncategorizedText", @@ -1367,63 +1151,9 @@ }, "text": "WORLD ECONOMIC OUTLOOK UPDATE, JANUARY 2023" }, - { - "type": "NarrativeText", - "element_id": "e7a8e30d6d49ffbca56f87cd6883c9a0", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 6 - }, - "text": "major trading partner economies, and in Brazil, greater-than-expected fiscal support. Growth in the region is projected to rise to 2.1 percent in 2024, although with a downward revision of 0.3 percentage point, reflecting tighter financial conditions, lower prices of exported commodities, and downward revisions to trading partner growth." - }, - { - "type": "Title", - "element_id": "3f79bb7b435b05321651daefd374cdc6", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 6 - }, - "text": "e" - }, { "type": "ListItem", - "element_id": "e3b0c44298fc1c149afbf4c8996fb924", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 6 - }, - "text": "" - }, - { - "type": "NarrativeText", - "element_id": "25e2f1dc031b5421b8a234945098e58b", + "element_id": "e0fc62fcfa1add3cf912fbaf3e0c9ba1", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -1437,7 +1167,7 @@ "filetype": "application/pdf", "page_number": 6 }, - "text": "Growth in the Middle East and Central Asia is projected to decline from 5.3 percent in 2022 to 3.2 percent in 2023, with a downward revision of 0.4 percentage point since October, mainly attributable to a steeper-than-expected growth slowdown in Saudi Arabia, from 8.7 percent in 2022 (which was stronger than expected by 1.1 percentage points) to 2.6 percent in 2023, with a negative revision of 1.1 percentage points. The downgrade for 2023 reflects mainly lower oil production in line with an agreement through OPEC+ (Organization of the Petroleum Exporting Countries, including Russia and other non-OPEC oil exporters), while non-oil growth is expected to remain robust. In sub-Saharan Africa, growth is projected to remain moderate at 3.8 percent in 2023 amid prolonged fallout from the COVID-19 pandemic, although with a modest upward revision since October, before picking up to 4.1 percent in 2024. The small upward revision for 2023 (0.1 percentage point) reflects Nigeria’s rising growth in 2023 due to measures to address insecurity issues in the oil sector. In South Africa, by contrast, after a COVID-19 reopening rebound in 2022, projected growth more than halves in 2023, to 1.2 percent, reflecting weaker external demand, power shortages, and structural constraints." + "text": " Growth in the Middle East and Central Asia is projected to decline from 5.3 percent in 2022 to 3.2 percent in 2023, with a downward revision of 0.4 percentage point since October, mainly attributable to a steeper-than-expected growth slowdown in Saudi Arabia, from 8.7 percent in 2022 (which was stronger than expected by 1.1 percentage points) to 2.6 percent in 2023, with a negative revision of 1.1 percentage points. The downgrade for 2023 reflects mainly lower oil production in line with an agreement through OPEC+ (Organization of the Petroleum Exporting Countries, including Russia and other non-OPEC oil exporters), while non-oil growth is expected to remain robust. In sub-Saharan Africa, growth is projected to remain moderate at 3.8 percent in 2023 amid prolonged fallout from the COVID-19 pandemic, although with a modest upward revision since October, before picking up to 4.1 percent in 2024. The small upward revision for 2023 (0.1 percentage point) reflects Nigeria’s rising growth in 2023 due to measures to address insecurity issues in the oil sector. In South Africa, by contrast, after a COVID-19 reopening rebound in 2022, projected growth more than halves in 2023, to 1.2 percent, reflecting weaker external demand, power shortages, and structural constraints." }, { "type": "Title", @@ -3420,8 +3150,8 @@ "text": "WORLD ECONOMIC OUTLOOK UPDATE, JANUARY 2023" }, { - "type": "NarrativeText", - "element_id": "d379a79a55cecddeed62b21eb6a0ff00", + "type": "ListItem", + "element_id": "79a6a9353dc2a500e2e50e720cf8ab7c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3435,11 +3165,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "However, the boost to demand could stoke core inflation, leading to even tighter monetary policies and a stronger-than-expected slowdown later on. Pent-up demand could also fuel a stronger rebound in China." + "text": "However, the boost to demand could stoke core inflation, leading to even tighter monetary policies and a stronger-than-expected slowdown later on. Pent-up demand could also fuel a stronger rebound in China. e Faster disinflation: An easing in labor market pressures in some advanced economies due to falling vacancies could cool wage inflation without necessarily increasing unemployment. A sharp fall in the prices of goods, as consumers shift back to services, could further push down inflation. Such developments could imply a “softer” landing with less monetary tightening." }, { - "type": "ListItem", - "element_id": "2bbe57e6c291db638d3fcddca9e0199a", + "type": "NarrativeText", + "element_id": "a2f806b25a06969405637298b4c85139", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3453,11 +3183,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "Faster disinflation: An easing in labor market pressures in some advanced economies due to" + "text": "Downside risks—Numerous downside risks continue to weigh on the global outlook, lowering growth while, in a number of cases, adding further to inflation:" }, { - "type": "NarrativeText", - "element_id": "3f9155fad634c620bd9b820132e20935", + "type": "ListItem", + "element_id": "e9fbac47e4ed0c2d153022a284a77919", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3471,191 +3201,11 @@ "filetype": "application/pdf", "page_number": 8 }, - "text": "falling vacancies could cool wage inflation without necessarily increasing unemployment. A sharp fall in the prices of goods, as consumers shift back to services, could further push down inflation. Such developments could imply a “softer” landing with less monetary tightening." + "text": "© = China’s recovery stalling: Amid still-low population immunity levels and insufficient hospital capacity, especially outside the major urban areas, significant health consequences could hamper the recovery. A deepening crisis in the real estate market remains a major source of vulnerability, with risks of widespread defaults by developers and resulting financial sector instability. Spillovers to the rest of the world would operate primarily through lower demand and potentially renewed supply chain problems. e = =War in Ukraine escalating: An escalation of the war in Ukraine remains a major source of vulnerability, particularly for Europe and lower-income countries. Europe is facing lower-than- anticipated gas prices, having stored enough gas to make shortages unlikely this winter. However, refilling storage with much-diminished Russian flows will be challenging ahead of next winter, particularly if it is a very cold one and China’s energy demand picks up, causing ptice spikes. A possible increase in food prices from a failed extension of the Black Sea grain initiative would put further pressure on lower-income countries that are experiencing food insecurity and have limited budgetary room to cushion the impact on households and businesses. With elevated food and fuel prices, social unrest may increase. e Debt distress: Since October, sovereign spreads for emerging market and developing economies have modestly declined on the back of an easing in global financial conditions (Box 1) and dollar depreciation. About 15 percent of low-income countries are estimated to be in debt distress, with an additional 45 percent at high risk of debt distress and about 25 percent of emerging market economies also at high risk. The combination of high debt levels from the pandemic, lower growth, and higher borrowing costs exacerbates the vulnerability of these economies, especially those with significant near-term dollar financing needs. e = Inflation persisting: Persistent labor market tightness could translate into stronger-than-expected wage growth. Higher-than-expected oil, gas, and food prices from the war in Ukraine or from a faster rebound in China’s growth could again raise headline inflation and pass through into underlying inflation. Such developments could cause inflation expectations to de-anchor and require an even tighter monetary policy. e = Sudden financial market repricing: A prematute easing in financial conditions in response to lower headline inflation data could complicate anti-inflation policies and necessitate additional monetary tightening. For the same reason, unfavorable inflation data releases could trigger sudden repricing of assets and increase volatility in financial markets. Such movements could strain liquidity and the functioning of critical markets, with ripple effects on the real economy. © Geopolitical fragmentation: The wat in Ukraine and the related international sanctions aimed at e pressuring Russia to end hostilities are splitting the world economy into blocs and reinforcing earlier geopolitical tensions, such as those associated with the US-China trade dispute." }, { - "type": "NarrativeText", - "element_id": "a2f806b25a06969405637298b4c85139", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "Downside risks—Numerous downside risks continue to weigh on the global outlook, lowering growth while, in a number of cases, adding further to inflation:" - }, - { - "type": "ListItem", - "element_id": "90a90e12a4c6b8b74d3c8d20a76f22dc", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "China’s recovery stalling: Amid still-low population immunity levels and insufficient hospital" - }, - { - "type": "NarrativeText", - "element_id": "1bbcee85386321e6e8235a64d4c34d73", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "capacity, especially outside the major urban areas, significant health consequences could hamper the recovery. A deepening crisis in the real estate market remains a major source of vulnerability, with risks of widespread defaults by developers and resulting financial sector instability. Spillovers to the rest of the world would operate primarily through lower demand and potentially renewed supply chain problems." - }, - { - "type": "ListItem", - "element_id": "42ac57e394bf7c98d908745cefce0b80", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "War in Ukraine escalating: An escalation of the war in Ukraine remains a major source of" - }, - { - "type": "NarrativeText", - "element_id": "fdb59d523afa92db3942dabc88d94fc4", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "vulnerability, particularly for Europe and lower-income countries. Europe is facing lower-than- anticipated gas prices, having stored enough gas to make shortages unlikely this winter. However, refilling storage with much-diminished Russian flows will be challenging ahead of next winter, particularly if it is a very cold one and China’s energy demand picks up, causing price spikes. A possible increase in food prices from a failed extension of the Black Sea grain initiative would put further pressure on lower-income countries that are experiencing food insecurity and have limited budgetary room to cushion the impact on households and businesses. With elevated food and fuel prices, social unrest may increase." - }, - { - "type": "ListItem", - "element_id": "e3b0c44298fc1c149afbf4c8996fb924", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "" - }, - { - "type": "ListItem", - "element_id": "2d14934d52ff357c52e9ae1c38f7390e", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "Debt distress: Since October, sovereign spreads for emerging market and developing economies have modestly declined on the back of an easing in global financial conditions (Box 1) and dollar depreciation. About 15 percent of low-income countries are estimated to be in debt distress, with an additional 45 percent at high risk of debt distress and about 25 percent of emerging market economies also at high risk. The combination of high debt levels from the pandemic, lower growth, and higher borrowing costs exacerbates the vulnerability of these economies, especially those with significant near-term dollar financing needs. Inflation persisting: Persistent labor market tightness could translate into stronger-than-expected wage growth. Higher-than-expected oil, gas, and food prices from the war in Ukraine or from a faster rebound in China’s growth could again raise headline inflation and pass through into underlying inflation. Such developments could cause inflation expectations to de-anchor and require an even tighter monetary policy." - }, - { - "type": "ListItem", - "element_id": "33ccff3014b460178e62d9c8021fd728", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "Sudden financial market repricing: A premature easing in financial conditions in response to lower headline inflation data could complicate anti-inflation policies and necessitate additional monetary tightening. For the same reason, unfavorable inflation data releases could trigger sudden repricing of assets and increase volatility in financial markets. Such movements could strain liquidity and the functioning of critical markets, with ripple effects on the real economy." - }, - { - "type": "ListItem", - "element_id": "75bd22ee0ba778cc3a616ed0a9b42292", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "Geopolitical fragmentation: The war in Ukraine and the related international sanctions aimed at  pressuring Russia to end hostilities are splitting the world economy into blocs and reinforcing" - }, - { - "type": "NarrativeText", - "element_id": "810e5a86eae657e179ac8da86f317a62", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 8 - }, - "text": "earlier geopolitical tensions, such as those associated with the US-China trade dispute." - }, - { - "type": "Title", - "element_id": "8ae18586f23aa212e66aeb12a5638609", + "type": "Title", + "element_id": "8ae18586f23aa212e66aeb12a5638609", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -3961,97 +3511,7 @@ }, { "type": "ListItem", - "element_id": "bd7674df887463bc9f05c8030a151dea", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "Restraining the pandemic: Global coordination is needed to resolve bottlenecks in the global" - }, - { - "type": "NarrativeText", - "element_id": "cb704f1b6d23bfe23f6b4109c471ac8b", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "distribution of vaccines and treatments. Public support for the development of new vaccine technologies and the design of systematic responses to future epidemics also remains essential.  Addressing debt distress: Progress has been made for countries that requested debt treatment under the Group of Twenty’s Common Framework initiative, and more will be needed to strengthen it. It is also necessary to agree on mechanisms to resolve debt in a broader set of economies, including middle-income countries that are not eligible under the Common Framework. Non– Paris Club and private creditors have a crucial role to play in ensuring coordinated, effective, and timely debt resolution processes." - }, - { - "type": "ListItem", - "element_id": "af6eef18ec41f4980c1a4cbb5b7d4fec", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "Strengthening global trade: Strengthening the global trading system would address risks associated" - }, - { - "type": "Title", - "element_id": "0695b563acde461fc2f8d9aebccf35c7", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "with" - }, - { - "type": "NarrativeText", - "element_id": "e6f343736720ae4f9bf5202294c7c9fc", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "trade fragmentation. This can be achieved by rolling back restrictions on food exports and other essential items such as medicine, upgrading World Trade Organization (WTO) rules in critical areas such as agricultural and industrial subsidies, concluding and implementing new WTO-based agreements, and fully restoring the WTO dispute settlement system." - }, - { - "type": "ListItem", - "element_id": "d6f6afcf055ed3084a0fac1093458c88", + "element_id": "8dbc8ad2da37799a3719a01d44d2e506", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4065,43 +3525,7 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "Using the global financial safety net: With the cascading of shocks to the global economy, using the global financial safety net to its fullest extent is appropriate, including by proactively utilizing the IMF’s precautionary financial arrangements and channeling aid from the international community to low-income countries facing shocks." - }, - { - "type": "ListItem", - "element_id": "089c5759e7030e34a3b537d9e20bcd13", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "Speeding the green transition: To meet governments’ climate change goals, it is necessary to swiftly" - }, - { - "type": "NarrativeText", - "element_id": "77ac1fdd449fba59a90d978745964463", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", - "version": 265756457651539296174748931590365722430, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" - }, - "date_modified": "2023-02-14T07:31:28" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "implement credible mitigation policies. International coordination on carbon pricing or equivalent policies would facilitate faster decarbonization. Global cooperation is needed to build resilience to climate shocks, including through aid to vulnerable countries." + "text": "e = Restraining the pandemic: Global coordination is needed to resolve bottlenecks in the global distribution of vaccines and treatments. Public support for the development of new vaccine technologies and the design of systematic responses to future epidemics also remains essential. e = Addressing debt distress: Progress has been made for countries that requested debt treatment under the Group of Twenty’s Common Framework initiative, and more will be needed to strengthen it. It is also necessary to agree on mechanisms to resolve debt in a broader set of economies, including middle-income countries that are not eligible under the Common Framework. Non— Paris Club and private creditors have a crucial role to play in ensuring coordinated, effective, and timely debt resolution processes. e — Strengthening global trade: Strengthening the global trading system would address risks associated with trade fragmentation. This can be achieved by rolling back restrictions on food exports and other essential items such as medicine, upgrading World Trade Organization (WTO) rules in critical areas such as agricultural and industrial subsidies, concluding and implementing new WTO-based agreements, and fully restoring the WTO dispute settlement system. e Using the global financial safety net: With the cascading of shocks to the global economy, using the global financial safety net to its fullest extent is appropriate, including by proactively utilizing the IMF’s precautionary financial arrangements and channeling aid from the international community to low-income countries facing shocks. e Speeding the green transition: To meet governments’ climate change goals, it is necessary to swiftly implement credible mitigation policies. International coordination on carbon pricing or equivalent policies would facilitate faster decarbonization. Global cooperation is needed to build resilience to climate shocks, including through aid to vulnerable countries." }, { "type": "Title", @@ -4644,8 +4068,8 @@ "text": "Slowing aggregate demand and weaker-than-expected inflation prints in some major advanced economies have prompted investors’ anticipation of a further reduction in the pace of future policy rate hikes. Corporate earnings forecasts have been cut due to headwinds from slowing demand, and margins have contracted across most regions. In addition, survey-based probabilities of recession have been increasing, particularly in the United States and Europe. However, upside risks to the inflation outlook remain. Despite the recent moderation in headline inflation, core inflation remains stubbornly high across most regions, labor markets are still tight, energy prices remain pressured by Russia’s ongoing war in Ukraine, and supply chain disruptions may reappear. To keep these risks in check, financial conditions will likely need to tighten further. If not, central banks may need to increase policy rates even more in order to achieve their inflation objectives." }, { - "type": "NarrativeText", - "element_id": "261bebc8fb9b3ed5146d23644639bc26", + "type": "UncategorizedText", + "element_id": "e7f6c011776e8db7cd330b54174fd76f", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4659,11 +4083,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Given the tension between rising recession risks and monetary policy uncertainty, markets have seen significant volatility. While many central banks in advanced economies have stepped down the size of hikes, they have also explicitly stated they will need © —— Sources: Bloomberg Finance L.P.; and IMF staff calculations. Note: GFSR = Global Financial Stability Report. to keep rates higher, for a longer period of time, to tamp down inflation. Risk assets could face significant declines if earnings retrench further or if investors reassess theit outlook for monetary policy given central bank communications. Globally, the partial reversal of the dollar rally has contributed to recent easing due to improved risk appetite, and some emerging market central banks have paused tightening amid tentative signs that inflation may have peaked." + "text": "6" }, { "type": "UncategorizedText", - "element_id": "4e07408562bedb8b60ce05c1decfe3ad", + "element_id": "ef2d127de37b942baad06145e54b0c61", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4677,11 +4101,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "3" + "text": "5" }, { "type": "UncategorizedText", - "element_id": "6b86b273ff34fce19d6b804eff5a3f57", + "element_id": "4b227777d4dd1fc61c6f884f48641d02", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4695,11 +4119,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "1" + "text": "4" }, { "type": "UncategorizedText", - "element_id": "e7f6c011776e8db7cd330b54174fd76f", + "element_id": "4e07408562bedb8b60ce05c1decfe3ad", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4713,7 +4137,7 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "6" + "text": "3" }, { "type": "UncategorizedText", @@ -4735,7 +4159,7 @@ }, { "type": "UncategorizedText", - "element_id": "4b227777d4dd1fc61c6f884f48641d02", + "element_id": "6b86b273ff34fce19d6b804eff5a3f57", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4749,11 +4173,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "4" + "text": "1" }, { - "type": "UncategorizedText", - "element_id": "ef2d127de37b942baad06145e54b0c61", + "type": "Title", + "element_id": "6ef230728534d871e5126e2a55e12b26", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4767,11 +4191,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "5" + "text": "Figure 1.2. Market-Implied Expectations of Policy Rates (Percent)" }, { - "type": "Title", - "element_id": "49cf8421218222b21a0fc54ffce584c9", + "type": "UncategorizedText", + "element_id": "3e48114b7946f4dd7a12ae0b2c1121af", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4785,11 +4209,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Oct. 22" + "text": "© ——" }, { - "type": "Title", - "element_id": "6ef230728534d871e5126e2a55e12b26", + "type": "ListItem", + "element_id": "7d4f55875c970d850a152ba1d5ba02a5", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4803,11 +4227,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Figure 1.2. Market-Implied Expectations of Policy Rates (Percent)" + "text": "1. United States" }, { "type": "Title", - "element_id": "24a234895630131d612fc1b4605a256e", + "element_id": "8730d3c2022abf1f9665e4ca1da43e4d", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4821,11 +4245,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Apr. 23" + "text": "Latest" }, { - "type": "ListItem", - "element_id": "7d4f55875c970d850a152ba1d5ba02a5", + "type": "Title", + "element_id": "53d79cec96694df67ce3baff95d8a2e3", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4839,11 +4263,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "1. United States" + "text": "October 2022 GFSR" }, { - "type": "Title", - "element_id": "914e31edcbd035dbe9f1cfb7b29089a9", + "type": "ListItem", + "element_id": "8e655408cf212df5f74df13e05cdf02c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4857,11 +4281,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Oct. 23" + "text": "2. Euro area" }, { - "type": "Title", - "element_id": "8730d3c2022abf1f9665e4ca1da43e4d", + "type": "UncategorizedText", + "element_id": "ef2d127de37b942baad06145e54b0c61", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4875,11 +4299,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Latest" + "text": "5" }, { - "type": "Title", - "element_id": "d8478f45b9790d52201238244d0e9698", + "type": "UncategorizedText", + "element_id": "4b227777d4dd1fc61c6f884f48641d02", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4893,11 +4317,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Dec. 24" + "text": "4" }, { - "type": "Title", - "element_id": "fe1cc1c654c8a4fde402cfe2426326ef", + "type": "UncategorizedText", + "element_id": "4e07408562bedb8b60ce05c1decfe3ad", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4911,11 +4335,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Dec. 26" + "text": "3" }, { - "type": "Title", - "element_id": "49cf8421218222b21a0fc54ffce584c9", + "type": "UncategorizedText", + "element_id": "d4735e3a265e16eee03f59718b9b5d03", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4929,11 +4353,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "Oct. 22" + "text": "2" }, { - "type": "Title", - "element_id": "53d79cec96694df67ce3baff95d8a2e3", + "type": "UncategorizedText", + "element_id": "6b86b273ff34fce19d6b804eff5a3f57", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4947,11 +4371,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "October 2022 GFSR" + "text": "1" }, { - "type": "ListItem", - "element_id": "8e655408cf212df5f74df13e05cdf02c", + "type": "Title", + "element_id": "49cf8421218222b21a0fc54ffce584c9", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -4965,7 +4389,7 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "2. Euro area" + "text": "Oct. 22" }, { "type": "Title", @@ -5040,8 +4464,8 @@ "text": "Dec. 26" }, { - "type": "UncategorizedText", - "element_id": "ef2d127de37b942baad06145e54b0c61", + "type": "Title", + "element_id": "49cf8421218222b21a0fc54ffce584c9", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -5055,11 +4479,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "5" + "text": "Oct. 22" }, { - "type": "UncategorizedText", - "element_id": "4b227777d4dd1fc61c6f884f48641d02", + "type": "Title", + "element_id": "24a234895630131d612fc1b4605a256e", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -5073,11 +4497,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "4" + "text": "Apr. 23" }, { - "type": "UncategorizedText", - "element_id": "4e07408562bedb8b60ce05c1decfe3ad", + "type": "Title", + "element_id": "914e31edcbd035dbe9f1cfb7b29089a9", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -5091,11 +4515,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "3" + "text": "Oct. 23" }, { - "type": "UncategorizedText", - "element_id": "d4735e3a265e16eee03f59718b9b5d03", + "type": "Title", + "element_id": "d8478f45b9790d52201238244d0e9698", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -5109,11 +4533,11 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "2" + "text": "Dec. 24" }, { - "type": "UncategorizedText", - "element_id": "6b86b273ff34fce19d6b804eff5a3f57", + "type": "Title", + "element_id": "fe1cc1c654c8a4fde402cfe2426326ef", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", @@ -5127,7 +4551,25 @@ "filetype": "application/pdf", "page_number": 11 }, - "text": "1" + "text": "Dec. 26" + }, + { + "type": "NarrativeText", + "element_id": "2dd1b91ebd6543b4902626a579552919", + "metadata": { + "data_source": { + "url": "s3://utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf", + "version": 265756457651539296174748931590365722430, + "record_locator": { + "protocol": "s3", + "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/2023-Jan-economic-outlook.pdf" + }, + "date_modified": "2023-02-14T07:31:28" + }, + "filetype": "application/pdf", + "page_number": 11 + }, + "text": "Given the tension between rising recession risks and monetary policy uncertainty, markets have seen significant volatility. While many central banks in advanced economies have stepped down the size of hikes, they have also explicitly stated they will need to keep rates higher, for a longer period of time, to tamp down inflation. Risk assets could face significant declines if earnings retrench further or if investors reassess their outlook for monetary policy given central bank communications. Globally, the partial reversal of the dollar rally has contributed to recent easing due to improved risk appetite, and some emerging market central banks have paused tightening amid tentative signs that inflation may have peaked." }, { "type": "NarrativeText", diff --git a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/Silent-Giant-(1).pdf.json b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/Silent-Giant-(1).pdf.json index ef0bcde4d8..fc40f88495 100644 --- a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/Silent-Giant-(1).pdf.json +++ b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/Silent-Giant-(1).pdf.json @@ -199,7 +199,7 @@ }, { "type": "ListItem", - "element_id": "9c4387f669c689e9af0a712fd494b2d7", + "element_id": "e18242a460d9d495ea7cffee38c1e647", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -213,43 +213,7 @@ "filetype": "application/pdf", "page_number": 3 }, - "text": "The need for harmony in the nuclear regulatory environment" - }, - { - "type": "ListItem", - "element_id": "93e7dedc9d334470067ad2de1f9ee788", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "The need for a holistic safety paradigm for the whole electricity system." - }, - { - "type": "ListItem", - "element_id": "3cc3e847449fed4fa13bbd94f86e43a9", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 3 - }, - "text": "The need to create a level playing field that values reliability and energy security" + "text": "° The need to create a level playing field that values reliability and energy security ° The need for harmony in the nuclear regulatory environment ° The need for a holistic safety paradigm for the whole electricity system." }, { "type": "UncategorizedText", @@ -3439,7 +3403,7 @@ }, { "type": "NarrativeText", - "element_id": "338d3e15917414641f2b559473f168f8", + "element_id": "0ad07326f56e66781da5dbb9488eaa67", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3453,7 +3417,7 @@ "filetype": "application/pdf", "page_number": 9 }, - "text": "Figure 6. The lasting decarbonization of French electricity and nuclear’s ability to meet growing demand x" + "text": "Figure 6. The lasting decarbonization of French electricity and nuclear’s ability to meet growing demand”" }, { "type": "NarrativeText", @@ -3474,8 +3438,8 @@ "text": "The incredible energy density of uranium means that just a few kilos is all that is required to provide one person with enough power for a lifetime. Uranium is abundant and can be found in many parts of the world, as well as in seawater. Furthermore, spent nuclear fuel is well managed and can in most cases be recycled to produce even more power. By using nuclear energy, countries are able to take charge of their own destinies by decreasing their reliance on imported energy – enhanced independence and security in uncertain times." }, { - "type": "FigureCaption", - "element_id": "eeda9f9210dfe4be7e82b4385290d3ca", + "type": "Image", + "element_id": "36ca9b7cdbbcba729a46487cf86c07eb", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3636,224 +3600,8 @@ "text": "i" }, { - "type": "Title", - "element_id": "5d7f49449ab22deac22d767b89549c55", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "ii" - }, - { - "type": "Title", - "element_id": "f5557d4fcf727a981a3c315aca733eef", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "iii" - }, - { - "type": "Title", - "element_id": "0ab306823035661bb8dba21cc2535231", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "iv" - }, - { - "type": "Title", - "element_id": "d3fc2842ddfad4c8d3859f84d4439bfd", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "Vv" - }, - { - "type": "Title", - "element_id": "4c94485e0c21ae6c41ce1dfe7b6bface", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "v" - }, - { - "type": "Title", - "element_id": "c0ff93ea8927a7366db0331e5fd9d19f", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "vi" - }, - { - "type": "Title", - "element_id": "c0ff93ea8927a7366db0331e5fd9d19f", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "vi" - }, - { - "type": "Title", - "element_id": "c1d2906220d1eef1b17422b7132872a8", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "vii" - }, - { - "type": "NarrativeText", - "element_id": "de72de35f0092bdd3107011f3be18dc0", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "International Energy Agency (2018), World Energy Outlook 2018. Data accessed from https://www.iea.org/weo/ – Based on the New Policies Scenario, which incorporates existing energy policies as well as an assessment of the results likely to stem from the implementation of announced policy intentions – with visual modification by World Nuclear Association. International Energy Agency (n.d.), Statistics. Accessed from: https://www.iea.org/statistics/?country=WORLD&year=2016&category=Electricity&indicator=ElecGenByFuel&mode =chart&dataTable=ELECTRICITYANDHEAT – with visual modifications by World Nuclear Association. International Energy Agency (2019), Nuclear Power in a Clean Energy System. Accessed from: https://www.iea.org/ publications/nuclear/ Intergovernmental Panel on Climate Change (2018), Special Report on Global Warming of 1.5 °C. Accessed from: https://www.ipcc.ch/sr15/ International Energy Agency (2019), Nuclear Power in a Clean Energy System. Accessed from: https://www.iea.org/ publications/nuclear/ International Energy Agency & OECD Nuclear Energy Agency (2015), Projected Costs of generating Electricity – 2015 Edition. Accessed from: https://www.oecd-nea.org/ndd/pubs/2015/7057-proj-costs-electricity-2015.pdf International Atomic Energy Agency (2015), Technical challenges in the application and licensing of digital instrumentation and control systems in nuclear power plants. Accessed from: https://www-pub.iaea.org/MTCD/ Publications/PDF/P1695_web.pdf" - }, - { - "type": "NarrativeText", - "element_id": "b6396ecd6f60e3dcca17c045c00846c1", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "viii Paul-Scherrer Institute. Data for nuclear accidents modified to reflect UNSCEAR findings/recommendations (2012)" - }, - { - "type": "Title", - "element_id": "ed171375d0bf81eaa5512140c3a29b8f", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "ix" - }, - { - "type": "Title", - "element_id": "2d711642b726b04401627ca9fbac32f5", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", - "version": 177372694731575984083482917563244941766, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf" - }, - "date_modified": "2023-02-12T10:10:36" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "x" - }, - { - "type": "UncategorizedText", - "element_id": "5897aff759a5cc8d94710101c73af296", + "type": "ListItem", + "element_id": "ffc47b19bb43cce8c23421b5c78b17b4", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/Silent-Giant-(1).pdf", @@ -3867,7 +3615,7 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "and NRC SOARCA study 2015 International Energy Agency (2018), Electricity Information 2018 https://webstore.iea.org/electricity-information-2018-overview Ibid." + "text": "i nternational Energy Agency (20 results Nuclear Association. ii nternational iii nternational Energy Agency (20 publications/nuclear/ 8), World Energy Outloo! Energy Agency (n.d.), Statistics. Accessed from: https://www.iea.org/statistics/?country=>WORLD&year=20 =chart&dataTable=ELECTRICITYANDHEAT - with visual modifications by World Nuclear Association. 9), Nuclear Power in a CI 2018. Data accessed from https://www.iea.org/weo/ — Based on the New Policies Scenario, which incorporates existing energy policies as well as an assessment of the ikely to stem from the implementation of announced policy intentions — with visual modification by World 6&category=Electricity&indicator=ElecGenByFuel&mode lean Energy System. Accessed from: https://www.iea.org/ iv Intergovernmental Panel on Climate Change (2018), Special Report on Global Warming of 1.5 °C. Accessed from: https:/Awww.ipce.ch/sr15/ Vv nternational Energy Agency (20 publications/nuclear/ vi nternational vii International Publications/PDF/P1695_web.pdf and NRC SOARCA study 2015 ix nternational x bid. 9), Nuclear Power in a CI Energy Agency & OECD Nuclear Energy Agency (2015), Projected Costs o 2015 Edition. Accessed from: https:/Awww.oecd-nea.org/ndd/pubs/2015/7057-proj-costs-electricity-2015.pdf Atomic Energy Agency (2015), Technical challenges in the application and instrumentation and control systems in nuclear power plants. Accessed from: https://www-pub.iaea.org/MTCD/ Energy Agency (2018), Electricity Information 2018 https://webstore.iea.org/electricity-information-2018-overview lean Energy System. Accessed from: https://www.iea.org/ generating Electricity — icensing of digital Paul-Scherrer Institute. Data for nuclear accidents modified to reflect UNSCEAR findings/recommendations (2012)" }, { "type": "NarrativeText", diff --git a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/recalibrating-risk-report.pdf.json b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/recalibrating-risk-report.pdf.json index a0b176312b..715b8fc617 100644 --- a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/recalibrating-risk-report.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/recalibrating-risk-report.pdf.json @@ -1549,7 +1549,7 @@ }, { "type": "NarrativeText", - "element_id": "8921c0f3c29bc04c22c9c40f4eef6613", + "element_id": "a9d31d88b0e2026dbed12c8b5536ab2b", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1563,7 +1563,7 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "Figure 3. Comparison of number of fatalities due to electricity generation, including accidents and air pollution3" + "text": "Figure 3. Comparison of number of fatalities due to electricity generation, including accidents and air pollution®" }, { "type": "NarrativeText", @@ -1585,7 +1585,7 @@ }, { "type": "NarrativeText", - "element_id": "e450813fe6430d87c4caa64e4792bc74", + "element_id": "1ff44442b3a554331aaf4ffb30b7eda6", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -1599,25 +1599,7 @@ "filetype": "application/pdf", "page_number": 5 }, - "text": "2 Including 28 firefighters that were exposed to lethal amounts of radiation during the accident night, and 15 fatal cases of thyroid cancer. 3 Sources drawn upon: Markandya, A., & Wilkinson, P. (2007), Sovacool et al. (2016). Data for nuclear accidents modified to reflect the" - }, - { - "type": "Title", - "element_id": "31138d5dc0c297144d27d5dbd15d5ef0", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 5 - }, - "text": "2012 UNSCEAR report and the 2015 US NRC SOARCA study." + "text": "2 Including 28 firefighters that were exposed to lethal amounts of radiation during the accident night, and 15 fatal cases of thyroid cancer. $ Sources drawn upon: Markandya, A., & Wilkinson, P. (2007), Sovacool et al. (2016). Data for nuclear accidents modified to reflect the 2012 UNSCEAR report and the 2015 US NRC SOARCA study." }, { "type": "UncategorizedText", @@ -2178,386 +2160,8 @@ "text": "i" }, { - "type": "NarrativeText", - "element_id": "e72fdf383c0b4d8cba0284d4f7ff06d5", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "World Health Organization (2020). Road traffic injuries. Available at: https://www.who.int/news-room/fact-sheets/ detail/road-traffic-injuries" - }, - { - "type": "Title", - "element_id": "5d7f49449ab22deac22d767b89549c55", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "ii" - }, - { - "type": "Title", - "element_id": "f5557d4fcf727a981a3c315aca733eef", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "iii" - }, - { - "type": "Title", - "element_id": "4c94485e0c21ae6c41ce1dfe7b6bface", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "v" - }, - { - "type": "Title", - "element_id": "c0ff93ea8927a7366db0331e5fd9d19f", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "vi" - }, - { - "type": "NarrativeText", - "element_id": "9d45931b60fa1041a13243a1ee1bb170", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "xii BP, 2020. BP Statistical Review of World Energy, London: BP." - }, - { - "type": "NarrativeText", - "element_id": "794a96b3ab9a3e860f65549c3a106704", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "viii National Cancer Institute (2020). Cancer statistics. Available at: https://www.cancer.gov/about-cancer/" - }, - { - "type": "NarrativeText", - "element_id": "94178a8c2e84bf4b8f2eed9c79d7cfd5", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "ix Cancer Research UK (n.d.). Cancer risk statistics. Available at: https://www.cancerresearchuk.org/health-" - }, - { - "type": "NarrativeText", - "element_id": "4051afedda98549176dc28aaa9087e81", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "iv United Nations Scientific Committee on the Effects of Radiation (2016). Report of the United Nations Scientific" - }, - { - "type": "NarrativeText", - "element_id": "d85940c91ae6b53fc4b41bd5137e7371", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "xi World Health Organization (2018). Climate change and health. Available at: https://www.who.int/news-room/fact-" - }, - { - "type": "NarrativeText", - "element_id": "9a236889bced20048d1619798291d194", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "vii World Health Organization. (2016). Updated tables 2016 for ‘Preventing disease through health environments: a" - }, - { - "type": "NarrativeText", - "element_id": "26a84724035df76d7d8a6610a6fa4627", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "x OECD-NEA (2019). The Full Costs of Electricity Provision. Available at: https://www.oecd-nea.org/jcms/pl_14998/" - }, - { - "type": "Title", - "element_id": "6e98dee26ce2439cd4b8af82426e894e", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "understanding/statistics" - }, - { - "type": "Title", - "element_id": "759772833f6756e511150b2a49233864", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "professional/cancer-statistics/risk" - }, - { - "type": "Title", - "element_id": "86c0a0cef7faa217f386f75ead17dbec", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "sheets/detail/climate-change-and-health" - }, - { - "type": "Title", - "element_id": "7267222b91f507e040c69dad9af7941f", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "the-full-costs-of-electricity-provision?details=true" - }, - { - "type": "NarrativeText", - "element_id": "2ef1e8614bc32af635d2a0c894b2ed3c", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "Slovic, P., 2010. The Psychology of risk. Saúde e Sociedade, 19(4), pp. 731-747." - }, - { - "type": "NarrativeText", - "element_id": "e4d7c811a799c3c8e706125556f8a370", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "BBC (2020). Plane crash fatalities fell more than 50% in 2019. Available at: https://www.bbc.co.uk/news/ business-50953712" - }, - { - "type": "NarrativeText", - "element_id": "98e5f594de0e79990a0650489fdf295c", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "Committee on the Effects of Atomic Radiation. Accessed from: https://www.unscear.org/docs/publications/2016/ UNSCEAR_2016_GA-Report-CORR.pdf" - }, - { - "type": "NarrativeText", - "element_id": "d5658e2a49995a2f4ca4b45d95f2058b", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "global assessment of the burden of disease from environmental risks’. Available at: https://www.who.int/data/gho/ data/themes/public-health-and-environment [Accessed on 8 April 2021]" - }, - { - "type": "NarrativeText", - "element_id": "c328c06c32c00c43471cd3c9d257c68b", - "metadata": { - "data_source": { - "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", - "version": 306475068461766865312866697521104206816, - "record_locator": { - "protocol": "s3", - "remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf" - }, - "date_modified": "2023-02-12T10:09:32" - }, - "filetype": "application/pdf", - "page_number": 10 - }, - "text": "International Energy Agency (2020). Global share of total energy supply by source, 2018. Key World Energy Statistics 2020. Available at: https://www.iea.org/data-and-statistics/charts/global-share-of-total-energy-supply-by- source-2018" - }, - { - "type": "NarrativeText", - "element_id": "6bbd046b939157389606adf4059fe1f3", + "type": "ListItem", + "element_id": "158d56841d65947a9a91a3ca34163a4c", "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf", @@ -2571,7 +2175,7 @@ "filetype": "application/pdf", "page_number": 10 }, - "text": "Vohra, K., Vodonos, A., Schwartz, J., Marais, E., Sulprizio, M., & Mickley, L. (2021). Global mortality from outdoor fine particle pollution generated by fossil fuel combustion: Results from GEOS-Chem. Environmental Research, 195, p. 1-8" + "text": "Vi VIL xi xii World Health Organization (2020). Road traffic injuries. Available at: https://www.who.int/news-room/fact-sheets/ detail/road-traffic-injuries BBC (2020). Plane crash fatalities fell more than 50% in 2019. Available at: https:/Awww.bbc.co.uk/news/ business-50953712 Slovic, P, 2010. The Psychology of risk. Sauide e Sociedade, 19(4), pp. 731-747. United Nations Scientific Committee on the Effects of Radiation (2016). Report of the United Nations Scientific Committee on the Effects of Atomic Radiation. Accessed from: https:/Avww.unscear.org/docs/publications/2016/ UNSCEAR_2016_GA-Report-CORR.pdf International Energy Agency (2020). Global share of total energy supply by source, 2018. Key World Energy Statistics 2020. Available at: https://www.iea.org/data-and-statistics/charts/global-share-of-total-energy-supply-by- source-2018 Vohra, K., Vodonos, A., Schwartz, J., Marais, E., Sulprizio, M., & Mickley, L. (2021). Global mortality from outdoor fine particle pollution generated by fossil fuel combustion: Results from GEOS-Chem. Environmental Research, 195, p. 1-8 World Health Organization. (2016). Updated tables 2016 for ‘Preventing disease through health environments: a global assessment of the burden of disease from environmental risks’. Available at: https://www.who.int/data/gho/ data/themes/public-health-and-environment [Accessed on 8 April 2021] National Cancer Institute (2020). Cancer statistics. Available at: https://www.cancer.gov/about-cancer/ understanding/statistics Cancer Research UK (n.d.). Cancer risk statistics. Available at: https:/Awww.cancerresearchuk.org/health- professional/cancer-statistics/risk OECD-NEA (2019). The Full Costs of Electricity Provision. Available at: https:/Avww.oecd-nea.org/jcms/pl_14998/ the-full-costs-of-electricity-provision?details=true World Health Organization (2018). Climate change and health. Available at: https:/Awww.who.int/news-room/fact- sheets/detail/climate-change-and-health BP 2020. BP Statistical Review of World Energy, London: BP" }, { "type": "NarrativeText", diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 3703d5d96a..2b9078795c 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.19-dev0" # pragma: no cover +__version__ = "0.10.19-dev1" # pragma: no cover diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index f1b03b63bd..4cfa6b044a 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -329,7 +329,11 @@ def _partition_pdf_or_image_local( ocr_languages = prepare_languages_for_tesseract(languages) - model_name = model_name if model_name else os.environ.get("UNSTRUCTURED_HI_RES_MODEL_NAME") + model_name = ( + model_name + if model_name + else os.environ.get("UNSTRUCTURED_HI_RES_MODEL_NAME", "detectron2_onnx") + ) pdf_image_dpi = kwargs.pop("pdf_image_dpi", None) extract_images_in_pdf = kwargs.get("extract_images_in_pdf", False) image_output_dir_path = kwargs.get("image_output_dir_path", None)