diff --git a/.gitignore b/.gitignore
index 8cf3a69c14..f2dd1e35ea 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,6 +12,7 @@ build/
develop-eggs/
dist/
downloads/
+figures/
eggs/
.eggs/
lib/
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5f7a0fa582..731619f4da 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.11.4-dev5
+## 0.11.4-dev6
### Enhancements
diff --git a/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json b/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json
index dc6f8416a6..fbab702565 100644
--- a/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json
@@ -154,7 +154,7 @@
"type": "ListItem"
},
{
- "element_id": "ff686e6046cd8176988d8dec0d8adac4",
+ "element_id": "c2036e04407827dd9f895cce7fdb8674",
"metadata": {
"data_source": {
"date_created": "2023-03-10T09:32:44+00:00",
diff --git a/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json
index e8b064d503..7c9a9b3a82 100644
--- a/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json
@@ -325,7 +325,7 @@
"type": "NarrativeText"
},
{
- "element_id": "82391aed75376c2c3bc734ad52ec73e4",
+ "element_id": "63ffdff8ce24056d9d776dda2adcc934",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -334,7 +334,7 @@
],
"page_number": 2
},
- "text": "How data were acquired The cleaned and weighed specimen was suspended in beakers con- taining 0.5 M H2SO4 solution of different concentrations of egg shell powder. The pre-weighed stainless steel samples were retrieved from the test solutions after every 24 h, cleaned appropriately, dried and reweighed. Raw, analyzed The difference between the weight at a given time and the initial weight of the specimen was taken as the weight loss, which was used to calculate the corrosion rate and inhibition efficiency. Inhibitor concentration, exposure time Department of Chemical, Metallurgical and Materials Engineering, Tshwane University of Technology, Pretoria, South Africa Data are available within this article O. Sanni, A. P. I. Popoola, and O. S. I. Fayomi, Enhanced corrosion resistance of stainless steel type 316 in sulphuric acid solution using eco-friendly waste product, Results in Physics, 9 (2018) 225–230. Data format Experimental factors Experimental features Data source location Accessibility Related research article",
+ "text": "How data were acquired Data format Experimental factors Experimental features Data source location Accessibility Related research article The cleaned and weighed specimen was suspended in beakers con- taining 0.5 M H2SO4 solution of different concentrations of egg shell powder. The pre-weighed stainless steel samples were retrieved from the test solutions after every 24 h, cleaned appropriately, dried and reweighed. Raw, analyzed The difference between the weight at a given time and the initial weight of the specimen was taken as the weight loss, which was used to calculate the corrosion rate and inhibition efficiency. Inhibitor concentration, exposure time Department of Chemical, Metallurgical and Materials Engineering, Tshwane University of Technology, Pretoria, South Africa Data are available within this article O. Sanni, A. P. I. Popoola, and O. S. I. Fayomi, Enhanced corrosion resistance of stainless steel type 316 in sulphuric acid solution using eco-friendly waste product, Results in Physics, 9 (2018) 225–230.",
"type": "Table"
},
{
@@ -429,7 +429,7 @@
"type": "NarrativeText"
},
{
- "element_id": "ba559032c2f9f98c24e4c547af135b8e",
+ "element_id": "3fcf35e8b67240541d3f2bf3bc0a39c5",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -438,7 +438,7 @@
],
"page_number": 2
},
- "text": "30 10g 8g 6g 4g 2g Control ) g m ( s s o 20 l t h g e W i 10 48 96 144 192 ",
+ "text": "30 ) g m ( s s o l t h g e W i 20 10g 8g 6g 4g 2g Control 10 48 96 144 192 ",
"type": "Image"
},
{
@@ -481,7 +481,7 @@
"type": "NarrativeText"
},
{
- "element_id": "f2e384e79a4fbce052f262a93ec46102",
+ "element_id": "9fd8126152fe50909dc643c92ff6cd4c",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -490,7 +490,7 @@
],
"page_number": 3
},
- "text": "2.7 1.8 10g 8g 6g 4g 2g Control 0.9 24 48 72 96 120 144 168 192 Exposure time ",
+ "text": "2.7 1.8 0.9 10g 8g 6g 4g 2g Control 24 48 72 96 120 144 168 192 Exposure time ",
"type": "Image"
},
{
@@ -585,7 +585,7 @@
"type": "UncategorizedText"
},
{
- "element_id": "00c6c21aa97f59dc84190f023eaaf769",
+ "element_id": "c1daf0ef9e2135894ab832147233a7f3",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -594,7 +594,7 @@
],
"page_number": 3
},
- "text": "90 2g 4g 6g 8g 10g 80 ) % 70 ( y c n e c i f f 60 i 50 E n o i t i b h n I 40 i 30 20 10 0 20 40 60 80 100 120 140 160 180 ",
+ "text": "90 ) % ( y c n e c i f f i E n o i t i b h n I i 80 70 60 50 40 30 2g 4g 6g 8g 10g 20 10 0 20 40 60 80 100 120 140 160 180 ",
"type": "Image"
},
{
@@ -715,7 +715,7 @@
"type": "UncategorizedText"
},
{
- "element_id": "150b064badb909ac7549f8064cf2caba",
+ "element_id": "a31f676a690660014de4c38544212163",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -724,7 +724,7 @@
],
"page_number": 4
},
- "text": "icorr (A/cm2) Polarization resistance (Ω) Inhibitor concentration (g) bc (V/dec) ba (V/dec) Ecorr (V) (cid:3) 0.9393 (cid:3) 0.8276 (cid:3) 0.8825 (cid:3) 0.8027 (cid:3) 0.5896 (cid:3) 0.5356 0 2 4 6 8 10 0.0335 1.9460 0.0163 0.3233 0.1240 0.0382 0.0409 0.0596 0.2369 0.0540 0.0556 0.0086 0.0003 0.0002 0.0001 5.39E-05 5.46E-05 1.24E-05 24.0910 121.440 42.121 373.180 305.650 246.080 2.8163 1.5054 0.9476 0.4318 0.3772 0.0919",
+ "text": "Inhibitor concentration (g) bc (V/dec) ba (V/dec) Ecorr (V) icorr (A/cm2) Polarization resistance (Ω) 0 2 4 6 8 10 0.0335 1.9460 0.0163 0.3233 0.1240 0.0382 0.0409 0.0596 0.2369 0.0540 0.0556 0.0086 (cid:3) 0.9393 (cid:3) 0.8276 (cid:3) 0.8825 (cid:3) 0.8027 (cid:3) 0.5896 (cid:3) 0.5356 0.0003 0.0002 0.0001 5.39E-05 5.46E-05 1.24E-05 24.0910 121.440 42.121 373.180 305.650 246.080 2.8163 1.5054 0.9476 0.4318 0.3772 0.0919",
"type": "Table"
},
{
@@ -1092,7 +1092,7 @@
"type": "Formula"
},
{
- "element_id": "aa63e6aba52eb53a896d01e2c7ccc133",
+ "element_id": "13bc6e646e1dea06c3836e074c7fe40f",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -1101,7 +1101,7 @@
],
"page_number": 6
},
- "text": "IE ð%Þ ¼ CRo (cid:3) CR 100 1 x CRo",
+ "text": "IE ð%Þ ¼ CRo (cid:3) CR CRo x 100 1",
"type": "Formula"
},
{
diff --git a/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json
index 64dd752eb5..a9e3e4bf19 100644
--- a/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json
@@ -416,7 +416,7 @@
"type": "UncategorizedText"
},
{
- "element_id": "4e5faed345ed29d23513a466e412ec0a",
+ "element_id": "d022b06e927bb8ee92ff9034e08e62de",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -425,7 +425,7 @@
],
"page_number": 2
},
- "text": "i , a start location, ls i , and an end location, le i , i , an end time, te and",
+ "text": "i , a start location, ls i , an end time, te i , and an end location, le i , and",
"type": "NarrativeText"
},
{
@@ -637,7 +637,7 @@
"type": "Title"
},
{
- "element_id": "ff667ddf988229560eaac54fc38ddc66",
+ "element_id": "cf21fea12c5e4fac7b8606af479c6edf",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@@ -646,7 +646,7 @@
],
"page_number": 4
},
- "text": "Number of lines Number of columns in each line Description 1 1 n 3 m 4 The number of depots, the number of trips, and the number of locations. The number of vehicles rd at each depot d. One line for each trip, i ¼ 1; 2; …; n. Each line provides the start location ls time ts i and the end time te i for the corresponding trip. Each element, δij; where i; j A 1; 2; …; l, refers to the travel time between location i and location j. i , the start i , the end location le l l",
+ "text": "Number of lines Number of columns in each line Description 1 1 n l 3 m 4 l The number of depots, the number of trips, and the number of locations. The number of vehicles rd at each depot d. One line for each trip, i ¼ 1; 2; …; n. Each line provides the start location ls time ts i and the end time te i for the corresponding trip. Each element, δij; where i; j A 1; 2; …; l, refers to the travel time between location i and location j. i , the start i , the end location le",
"type": "Table"
},
{
diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json
index 44ccf50556..1edfd1b60e 100644
--- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json
@@ -760,7 +760,7 @@
"type": "FigureCaption"
},
{
- "element_id": "c57f2166778009c6ccc9032ee8883253",
+ "element_id": "4577aebb24ff9c0fd98387936d5ef4a7",
"metadata": {
"data_source": {
"permissions_data": [
@@ -777,7 +777,7 @@
"page_number": 5,
"text_as_html": "
Dataset | | Base Model'| | | Notes | PubLayNet B8]| | F/M | Layouts of modern scientific documents |
PRImA | M | Layouts of scanned modern magazines and scientific report |
Newspaper | F | Layouts of scanned US newspapers from the 20th century |
TableBank | F | Table region on modern scientific and business document |
HJDataset | F/M | Layouts of history Japanese documents |
"
},
- "text": "Base Model1 Large Model Notes Dataset PubLayNet [38] PRImA [3] Newspaper [17] TableBank [18] HJDataset [31] F / M M F F F / M M - - F - Layouts of modern scientific documents Layouts of scanned modern magazines and scientific reports Layouts of scanned US newspapers from the 20th century Table region on modern scientific and business document Layouts of history Japanese documents",
+ "text": "Dataset Base Model1 Large Model Notes PubLayNet [38] PRImA [3] Newspaper [17] TableBank [18] HJDataset [31] F / M M F F F / M M - - F - Layouts of modern scientific documents Layouts of scanned modern magazines and scientific reports Layouts of scanned US newspapers from the 20th century Table region on modern scientific and business document Layouts of history Japanese documents",
"type": "Table"
},
{
@@ -1261,7 +1261,7 @@
"type": "NarrativeText"
},
{
- "element_id": "b35cc086edca679ccc52fa6701857549",
+ "element_id": "35adc4ddaef6ffb21b754c9c350c856f",
"metadata": {
"data_source": {
"permissions_data": [
@@ -1278,7 +1278,7 @@
"page_number": 8,
"text_as_html": "block.pad(top, bottom, | right, | left) | Enlarge the current block according to the input | block.scale(fx, fy) | | | Scale the current block given the ratio in x and y direction |
block.shift(dx, dy) | | | Move the current block with the shift distances in x and y direction |
block1.is_in(block2) | | | Whether block] is inside of block2 |
block1. intersect (block2) | | | Return the intersection region of blockl and block2. Coordinate type to be determined based on the inputs |
block1.union(block2) | | | Return the union region of blockl and block2. Coordinate type to be determined based on the inputs |
block1.relative_to(block2) | | | Convert the absolute coordinates of block to relative coordinates to block2 |
block1.condition_on(block2) | | | Calculate the absolute coordinates of blockl given the canvas block2’s absolute coordinates |
block. crop_image (image) | | | Obtain the image segments in the block region |
"
},
- "text": "Operation Name Description block.pad(top, bottom, right, left) Enlarge the current block according to the input Scale the current block given the ratio in x and y direction block.scale(fx, fy) Move the current block with the shift distances in x and y direction block.shift(dx, dy) Whether block1 is inside of block2 block1.is in(block2) Return the intersection region of block1 and block2. Coordinate type to be determined based on the inputs. block1.intersect(block2) Return the union region of block1 and block2. Coordinate type to be determined based on the inputs. block1.union(block2) Convert the absolute coordinates of block1 to relative coordinates to block2 block1.relative to(block2) Calculate the absolute coordinates of block1 given the canvas block2’s absolute coordinates block1.condition on(block2) Obtain the image segments in the block region block.crop image(image)",
+ "text": "Operation Name Description block.pad(top, bottom, right, left) Enlarge the current block according to the input block.scale(fx, fy) Scale the current block given the ratio in x and y direction block.shift(dx, dy) Move the current block with the shift distances in x and y direction block1.is in(block2) Whether block1 is inside of block2 block1.intersect(block2) Return the intersection region of block1 and block2. Coordinate type to be determined based on the inputs. block1.union(block2) Return the union region of block1 and block2. Coordinate type to be determined based on the inputs. block1.relative to(block2) Convert the absolute coordinates of block1 to relative coordinates to block2 block1.condition on(block2) Calculate the absolute coordinates of block1 given the canvas block2’s absolute coordinates block.crop image(image) Obtain the image segments in the block region",
"type": "Table"
},
{
diff --git a/test_unstructured_ingest/expected-structured-output/s3/2023-Jan-economic-outlook.pdf.json b/test_unstructured_ingest/expected-structured-output/s3/2023-Jan-economic-outlook.pdf.json
index a4b9bd97be..534bf35ca5 100644
--- a/test_unstructured_ingest/expected-structured-output/s3/2023-Jan-economic-outlook.pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/s3/2023-Jan-economic-outlook.pdf.json
@@ -1197,7 +1197,7 @@
"type": "Title"
},
{
- "element_id": "429d7ccdab398bfb2107fa00f9054272",
+ "element_id": "8174b87b76dfe8e8ddb31ab83abc6c33",
"metadata": {
"data_source": {
"date_modified": "2023-02-14T07:31:28",
@@ -1214,7 +1214,7 @@
],
"page_number": 7
},
- "text": "WEO Projections 1/ Estimate 2022 Projections 2023 Estimate 2022 Projections 2023 2021 2024 2023 2024 2024 6.2 3.4 2.9 3.1 0.2 –0.1 1.9 3.2 3.0 Advanced Economies United States Euro Area 5.4 5.9 5.3 2.6 6.8 6.7 5.5 2.1 7.6 5.0 5.3 2.7 2.0 3.5 1.9 2.6 3.9 5.2 1.4 4.1 3.5 2.8 1.2 1.4 0.7 0.1 0.7 0.6 1.1 1.8 –0.6 1.5 2.0 1.4 1.0 1.6 1.4 1.6 0.9 2.4 0.9 0.9 1.5 2.4 0.1 0.4 0.2 0.4 0.0 0.8 –0.1 0.2 –0.9 0.0 –0.3 –0.2 –0.2 –0.2 –0.1 0.0 –0.4 –0.2 –0.4 0.3 –0.1 –0.2 1.3 0.7 1.9 1.4 0.5 2.1 2.1 1.7 0.4 2.3 1.4 1.1 1.0 0.5 0.0 0.9 0.1 1.3 1.0 –0.5 1.2 2.1 1.6 1.3 2.1 2.3 1.8 1.0 2.8 1.0 1.8 1.9 2.2 Germany France Italy Spain Japan United Kingdom Canada Other Advanced Economies 3/ Emerging Market and Developing Economies Emerging and Developing Asia 6.7 7.4 8.4 8.7 6.9 4.7 7.0 5.0 4.7 4.5 3.2 4.7 3.6 4.9 3.9 4.3 3.0 6.8 0.7 –2.2 3.9 3.1 3.1 5.3 8.7 3.8 3.0 2.6 4.0 5.3 5.2 6.1 1.5 0.3 1.8 1.2 1.7 3.2 2.6 3.8 3.2 1.2 4.2 5.2 4.5 6.8 2.6 2.1 2.1 1.5 1.6 3.7 3.4 4.1 2.9 1.3 0.3 0.4 0.8 0.0 0.9 2.6 0.1 0.2 0.5 –0.4 –1.1 0.1 0.2 0.1 –0.1 0.0 0.0 0.0 0.1 0.6 –0.3 –0.4 –0.2 0.2 0.5 0.0 0.0 0.0 2.5 3.4 2.9 4.3 –2.0 –4.1 2.6 2.8 3.7 . . . 4.6 . . . 2.6 3.0 5.0 6.2 5.9 7.0 3.5 1.0 1.9 0.8 1.1 . . . 2.7 . . . 3.1 0.5 4.1 4.9 4.1 7.1 2.8 2.0 1.9 2.2 1.9 . . . 3.5 . . . 2.9 1.8 China India 4/ Emerging and Developing Europe Russia Latin America and the Caribbean Brazil Mexico Middle East and Central Asia Saudi Arabia Sub-Saharan Africa Nigeria South Africa Memorandum World Growth Based on Market Exchange Rates European Union ASEAN-5 5/ Middle East and North Africa Emerging Market and Middle-Income Economies Low-Income Developing Countries 6.0 5.5 3.8 4.1 7.0 4.1 3.1 3.7 5.2 5.4 3.8 4.9 2.4 0.7 4.3 3.2 4.0 4.9 2.5 1.8 4.7 3.5 4.1 5.6 0.3 0.0 –0.2 –0.4 0.4 0.0 –0.1 –0.3 –0.2 0.2 0.0 0.1 1.7 1.8 3.7 . . . 2.5 . . . 2.5 1.2 5.7 . . . 5.0 . . . 2.5 2.0 4.0 . . . 4.1 . . . 10.4 9.4 12.1 5.4 6.6 3.4 2.4 2.3 2.6 3.4 2.7 4.6 –0.1 0.0 –0.3 –0.3 –0.4 0.0 . . . . . . . . . . . . . . . . . . 
. . . . . . . . . 65.8 26.4 39.8 7.0 –16.2 –6.3 –7.1 –0.4 –3.3 –0.1 –0.9 0.3 11.2 –2.0 –9.8 1.4 –5.9 –0.2",
+ "text": "2021 Estimate 2022 Projections 2023 2024 WEO Projections 1/ 2023 2024 Estimate 2022 Projections 2023 2024 6.2 3.4 2.9 3.1 0.2 –0.1 1.9 3.2 3.0 Advanced Economies United States Euro Area Germany France Italy Spain Japan United Kingdom Canada Other Advanced Economies 3/ 5.4 5.9 5.3 2.6 6.8 6.7 5.5 2.1 7.6 5.0 5.3 2.7 2.0 3.5 1.9 2.6 3.9 5.2 1.4 4.1 3.5 2.8 1.2 1.4 0.7 0.1 0.7 0.6 1.1 1.8 –0.6 1.5 2.0 1.4 1.0 1.6 1.4 1.6 0.9 2.4 0.9 0.9 1.5 2.4 0.1 0.4 0.2 0.4 0.0 0.8 –0.1 0.2 –0.9 0.0 –0.3 –0.2 –0.2 –0.2 –0.1 0.0 –0.4 –0.2 –0.4 0.3 –0.1 –0.2 1.3 0.7 1.9 1.4 0.5 2.1 2.1 1.7 0.4 2.3 1.4 1.1 1.0 0.5 0.0 0.9 0.1 1.3 1.0 –0.5 1.2 2.1 1.6 1.3 2.1 2.3 1.8 1.0 2.8 1.0 1.8 1.9 2.2 Emerging Market and Developing Economies Emerging and Developing Asia China India 4/ Emerging and Developing Europe Russia Latin America and the Caribbean Brazil Mexico Middle East and Central Asia Saudi Arabia Sub-Saharan Africa Nigeria South Africa 6.7 7.4 8.4 8.7 6.9 4.7 7.0 5.0 4.7 4.5 3.2 4.7 3.6 4.9 3.9 4.3 3.0 6.8 0.7 –2.2 3.9 3.1 3.1 5.3 8.7 3.8 3.0 2.6 4.0 5.3 5.2 6.1 1.5 0.3 1.8 1.2 1.7 3.2 2.6 3.8 3.2 1.2 4.2 5.2 4.5 6.8 2.6 2.1 2.1 1.5 1.6 3.7 3.4 4.1 2.9 1.3 0.3 0.4 0.8 0.0 0.9 2.6 0.1 0.2 0.5 –0.4 –1.1 0.1 0.2 0.1 –0.1 0.0 0.0 0.0 0.1 0.6 –0.3 –0.4 –0.2 0.2 0.5 0.0 0.0 0.0 2.5 3.4 2.9 4.3 –2.0 –4.1 2.6 2.8 3.7 . . . 4.6 . . . 2.6 3.0 5.0 6.2 5.9 7.0 3.5 1.0 1.9 0.8 1.1 . . . 2.7 . . . 3.1 0.5 4.1 4.9 4.1 7.1 2.8 2.0 1.9 2.2 1.9 . . . 3.5 . . . 2.9 1.8 Memorandum World Growth Based on Market Exchange Rates European Union ASEAN-5 5/ Middle East and North Africa Emerging Market and Middle-Income Economies Low-Income Developing Countries 6.0 5.5 3.8 4.1 7.0 4.1 3.1 3.7 5.2 5.4 3.8 4.9 2.4 0.7 4.3 3.2 4.0 4.9 2.5 1.8 4.7 3.5 4.1 5.6 0.3 0.0 –0.2 –0.4 0.4 0.0 –0.1 –0.3 –0.2 0.2 0.0 0.1 1.7 1.8 3.7 . . . 2.5 . . . 2.5 1.2 5.7 . . . 5.0 . . . 2.5 2.0 4.0 . . . 4.1 . . . 10.4 9.4 12.1 5.4 6.6 3.4 2.4 2.3 2.6 3.4 2.7 4.6 –0.1 0.0 –0.3 –0.3 –0.4 0.0 . . . . . . . . . . . . . . . . . . 
. . . . . . . . . 65.8 26.4 39.8 7.0 –16.2 –6.3 –7.1 –0.4 –3.3 –0.1 –0.9 0.3 11.2 –2.0 –9.8 1.4 –5.9 –0.2",
"type": "Table"
},
{
@@ -2310,7 +2310,7 @@
"type": "NarrativeText"
},
{
- "element_id": "5ad3e97ac0a2d759e059893765b81954",
+ "element_id": "d750b11efc2f858c7deadb09e3929e1c",
"metadata": {
"data_source": {
"date_modified": "2023-02-14T07:31:28",
@@ -2327,7 +2327,7 @@
],
"page_number": 11
},
- "text": "United States Euro area China Other AEs Other EMs 7 October 2022 GFSR 6 5 4 3 2 1 0 –1 –2 –3 2006 08 08 10 10 12 12 14 16 14 18 18 20 22 22 06 16 20 ",
+ "text": "7 6 5 4 United States Euro area China Other AEs Other EMs October 2022 GFSR 3 2 1 0 –1 –2 –3 2006 08 08 06 10 10 12 12 14 16 14 16 18 18 20 22 22 20 ",
"type": "Image"
},
{
@@ -2394,7 +2394,7 @@
"type": "Title"
},
{
- "element_id": "5728dbbab19d146278a6a3387e8e40d5",
+ "element_id": "6215d8f373972db90d05458d63af9efe",
"metadata": {
"data_source": {
"date_modified": "2023-02-14T07:31:28",
@@ -2411,7 +2411,7 @@
],
"page_number": 11
},
- "text": "Latest October 2022 GFSR 5 6 2. Euro area 1. United States 5 4 4 3 3 2 2 1 1 Oct. 22 Apr. 23 Oct. 23 Dec. 24 Dec. 26 Oct. 22 Apr. 23 Oct. 23 Dec. 24 Dec. 26 ",
+ "text": "Latest October 2022 GFSR 6 1. United States 2. Euro area 5 4 3 2 5 4 3 2 1 1 Oct. 22 Apr. 23 Oct. 23 Dec. 24 Dec. 26 Oct. 22 Apr. 23 Oct. 23 Dec. 24 Dec. 26 ",
"type": "Image"
},
{
diff --git a/test_unstructured_ingest/expected-structured-output/s3/Silent-Giant-(1).pdf.json b/test_unstructured_ingest/expected-structured-output/s3/Silent-Giant-(1).pdf.json
index 3d3135ef91..558652deae 100644
--- a/test_unstructured_ingest/expected-structured-output/s3/Silent-Giant-(1).pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/s3/Silent-Giant-(1).pdf.json
@@ -441,7 +441,7 @@
"type": "UncategorizedText"
},
{
- "element_id": "27a3cde643219ef7662f032684e06bd4",
+ "element_id": "d5e389eb1b6b367ac5cf6e12acccfcbc",
"metadata": {
"data_source": {
"date_modified": "2023-02-12T10:10:36",
@@ -458,7 +458,7 @@
],
"page_number": 4
},
- "text": " Marine CSP 40,000 Solar PV 35,000 Geothermal 30,000 Wind Bioenergy 25,000 Hydro 20,000 Nuclear 15,000 Gas 10,000 Oil Coal 5,000 0 ",
+ "text": " Marine 40,000 CSP 35,000 Solar PV Geothermal 30,000 Wind 25,000 Bioenergy 20,000 Hydro Nuclear 15,000 Gas 10,000 Oil 5,000 Coal 0 ",
"type": "Image"
},
{
@@ -1071,7 +1071,7 @@
"type": "Title"
},
{
- "element_id": "196f551acb55f0373a9d7fac6c9dbeab",
+ "element_id": "2a5e6485f55769e5d4c820cb79f018d7",
"metadata": {
"data_source": {
"date_modified": "2023-02-12T10:10:36",
@@ -1088,7 +1088,7 @@
],
"page_number": 7
},
- "text": "300 250 200 150 100 50 0 O nshore Wind Offshore Wind N uclear m ercial Photovoltaic C oal C C G T C o m ",
+ "text": "300 250 200 150 100 50 0 m ercial Photovoltaic C o m O nshore Wind Offshore Wind N uclear C C G T C oal ",
"type": "Image"
},
{
@@ -1281,7 +1281,7 @@
"type": "UncategorizedText"
},
{
- "element_id": "577fd212dac38df299e478d6b7ce5d74",
+ "element_id": "2d52b4cb071eb1384e8f64581d907335",
"metadata": {
"data_source": {
"date_modified": "2023-02-12T10:10:36",
@@ -1298,7 +1298,7 @@
],
"page_number": 8
},
- "text": "120 120 99.5 100 71.9 80 60 40 20 8.5 1.78 0.245 <0.01 0 Offshore wind O nshore wind (G erm any) C oal Oil N atural gas N uclear* S olar P V (U K) ",
+ "text": "120 100 120 99.5 80 60 71.9 40 20 0 C oal Oil N atural gas 8.5 1.78 Offshore wind O nshore wind (G erm any) (U K) 0.245 S olar P V <0.01 N uclear* ",
"type": "Image"
},
{
@@ -1533,7 +1533,7 @@
"type": "UncategorizedText"
},
{
- "element_id": "1459b67becac6e70efecfcbc9312d3f0",
+ "element_id": "67ff7489d537e35454934b9dc3a725f9",
"metadata": {
"data_source": {
"date_modified": "2023-02-12T10:10:36",
@@ -1550,7 +1550,7 @@
],
"page_number": 8
},
- "text": " Coal 90 Gas/Oil 80 Biofuels/Waste Wind/Solar 70 Hydro 60 Nuclear 50 40 30 20 10 ",
+ "text": "90 Coal Gas/Oil 80 Biofuels/Waste 70 Wind/Solar 60 Hydro Nuclear 50 40 30 20 10 ",
"type": "Image"
},
{
@@ -1659,7 +1659,7 @@
"type": "FigureCaption"
},
{
- "element_id": "ff963f0df99d82f7c343649121217117",
+ "element_id": "dddeec4eec1ff6db9e832ed00fea1b7e",
"metadata": {
"data_source": {
"date_modified": "2023-02-12T10:10:36",
@@ -1676,7 +1676,7 @@
],
"page_number": 9
},
- "text": "600 Non-hydro 500 ren. & waste Nuclear 400 Natural gas 300 Hydro Oil 200 Coal 100 0 ",
+ "text": "600 500 Non-hydro ren. & waste 400 Nuclear Natural gas 300 Hydro 200 Oil Coal 100 0 ",
"type": "Image"
},
{
diff --git a/test_unstructured_ingest/expected-structured-output/s3/recalibrating-risk-report.pdf.json b/test_unstructured_ingest/expected-structured-output/s3/recalibrating-risk-report.pdf.json
index 2ab27c2508..0fc20e972a 100644
--- a/test_unstructured_ingest/expected-structured-output/s3/recalibrating-risk-report.pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/s3/recalibrating-risk-report.pdf.json
@@ -336,7 +336,7 @@
"type": "NarrativeText"
},
{
- "element_id": "9405da801e46d0da5f19ea801ff4ff51",
+ "element_id": "92a15f52537ead259f4d9c2da1b22454",
"metadata": {
"data_source": {
"date_modified": "2023-02-12T10:09:32",
@@ -353,7 +353,7 @@
],
"page_number": 4
},
- "text": "Experts 1 Nuclear power 20 2 Motor vehicles 1 3 Handguns 4 4 Smoking 2 17 Electric power (non-nuclear) 9 22 X-rays 7 30 Vaccinations 25",
+ "text": "Experts 1 20 Nuclear power Motor vehicles 2 1 4 3 Handguns 2 4 Smoking Electric power (non-nuclear) 9 17 22 7 X-rays 25 30 Vaccinations",
"type": "Table"
},
{
@@ -630,7 +630,7 @@
"type": "UncategorizedText"
},
{
- "element_id": "0bcb3759fa68b68d784c3c3963253c90",
+ "element_id": "44f0d817d4311d9da996b2cb20dc80c8",
"metadata": {
"data_source": {
"date_modified": "2023-02-12T10:09:32",
@@ -647,7 +647,7 @@
],
"page_number": 5
},
- "text": "25 20 18.4 15 10 4.6 5 2.8 0.07 0.04 0.02 0.01 0 C oal Oil Bio m ass N atural gas Wind H ydropo w er S olar N uclear ",
+ "text": "25 20 18.4 15 10 5 4.6 2.8 0 C oal Oil Bio m ass N atural gas 0.07 Wind 0.04 H ydropo w er 0.02 S olar 0.01 N uclear ",
"type": "Image"
},
{
@@ -987,7 +987,7 @@
"type": "NarrativeText"
},
{
- "element_id": "73ffa3745f99b6332d0ddfac674755c6",
+ "element_id": "79de44b69099529ba9f79b31427cad59",
"metadata": {
"data_source": {
"date_modified": "2023-02-12T10:09:32",
@@ -1004,7 +1004,7 @@
],
"page_number": 7
},
- "text": "Social and environmental costs of emissions, land-use, climate change, security of supply, etc. Plant-level production costs at market prices Grid-level costs of the electricity system ",
+ "text": "Plant-level production costs at market prices Grid-level costs of the electricity system Social and environmental costs of emissions, land-use, climate change, security of supply, etc. ",
"type": "Image"
},
{
diff --git a/test_unstructured_ingest/metrics/text-extraction/aggregate-scores-cct.tsv b/test_unstructured_ingest/metrics/text-extraction/aggregate-scores-cct.tsv
index 8865cec2cf..6d98818c49 100644
--- a/test_unstructured_ingest/metrics/text-extraction/aggregate-scores-cct.tsv
+++ b/test_unstructured_ingest/metrics/text-extraction/aggregate-scores-cct.tsv
@@ -1,3 +1,3 @@
metric average sample_sd population_sd count
-cct-accuracy 0.803 0.249 0.241 16
-cct-%missing 0.024 0.033 0.032 16
+cct-accuracy 0.803 0.248 0.241 16
+cct-%missing 0.025 0.033 0.032 16
diff --git a/test_unstructured_ingest/metrics/text-extraction/all-docs-cct.tsv b/test_unstructured_ingest/metrics/text-extraction/all-docs-cct.tsv
index 9858ebfddb..053253fb8d 100644
--- a/test_unstructured_ingest/metrics/text-extraction/all-docs-cct.tsv
+++ b/test_unstructured_ingest/metrics/text-extraction/all-docs-cct.tsv
@@ -11,7 +11,7 @@ ideas-page.html html local 0.929 0.033
UDHR_first_article_all.txt txt local-single-file 0.995 0.0
fake-html-cp1252.html html local-single-file-with-encoding 0.659 0.0
layout-parser-paper-with-table.jpg jpg local-single-file-with-pdf-infer-table-structure 0.716 0.032
-layout-parser-paper.pdf pdf local-single-file-with-pdf-infer-table-structure 0.945 0.029
-2023-Jan-economic-outlook.pdf pdf s3 0.846 0.039
+layout-parser-paper.pdf pdf local-single-file-with-pdf-infer-table-structure 0.949 0.029
+2023-Jan-economic-outlook.pdf pdf s3 0.845 0.039
page-with-formula.pdf pdf s3 0.971 0.021
-recalibrating-risk-report.pdf pdf s3 0.973 0.007
+recalibrating-risk-report.pdf pdf s3 0.968 0.008
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 524d8978e5..baac6ca3d2 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.11.4-dev5" # pragma: no cover
+__version__ = "0.11.4-dev6" # pragma: no cover
diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py
index 523cd62be1..1cbbf6c0c4 100644
--- a/unstructured/partition/pdf_image/pdfminer_processing.py
+++ b/unstructured/partition/pdf_image/pdfminer_processing.py
@@ -18,6 +18,7 @@
rect_to_bbox,
)
from unstructured.partition.utils.constants import Source
+from unstructured.partition.utils.sorting import sort_text_regions
if TYPE_CHECKING:
from unstructured_inference.inference.layout import DocumentLayout
@@ -95,7 +96,7 @@ def get_regions_by_pdfminer(
layouts = []
# Coefficient to rescale bounding box to be compatible with images
coef = dpi / 72
- for i, (page, page_layout) in enumerate(open_pdfminer_pages_generator(fp)):
+ for page, page_layout in open_pdfminer_pages_generator(fp):
height = page_layout.height
layout: List["TextRegion"] = []
@@ -125,7 +126,13 @@ def get_regions_by_pdfminer(
if text_region.bbox is not None and text_region.bbox.area > 0:
layout.append(text_region)
+ # NOTE(christine): always do the basic sort first for deterministic order across
+ # python versions.
layout = order_layout(layout)
+
+ # apply the current default sorting to the layout elements extracted by pdfminer
+ layout = sort_text_regions(layout)
+
layouts.append(layout)
return layouts
diff --git a/unstructured/partition/utils/sorting.py b/unstructured/partition/utils/sorting.py
index 5b98919c82..b72bf7f1ff 100644
--- a/unstructured/partition/utils/sorting.py
+++ b/unstructured/partition/utils/sorting.py
@@ -1,5 +1,5 @@
import os
-from typing import List, Tuple
+from typing import TYPE_CHECKING, Any, List, Tuple
import numpy as np
@@ -8,6 +8,9 @@
from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_XY_CUT
from unstructured.partition.utils.xycut import recursive_xy_cut, recursive_xy_cut_swapped
+if TYPE_CHECKING:
+ from unstructured_inference.inference.elements import TextRegion
+
def coordinates_to_bbox(coordinates: CoordinatesMetadata) -> Tuple[int, int, int, int]:
"""
@@ -73,6 +76,24 @@ def coord_has_valid_points(coordinates: CoordinatesMetadata) -> bool:
return True
+def bbox_is_valid(bbox: Any) -> bool:
+ """
+ Verifies all 4 values in a bounding box exist and are non-negative.
+
+ Returns False when `bbox` is falsy (e.g. None or an empty sequence), when it does not
+ hold exactly 4 values, or when any value is below 0 or cannot be ordered against 0
+ (e.g. None). Zero is accepted: only values strictly below 0 are rejected.
+
+ NOTE(review): a NaN value is not rejected, because `NaN < 0` evaluates to False --
+ confirm whether NaN coordinates can reach this check.
+ """
+
+ if not bbox:
+ return False
+ if len(bbox) != 4:
+ return False
+ for v in bbox:
+ try:
+ if v < 0:
+ return False
+ # a value that does not support `< 0` (e.g. None) counts as a missing coordinate
+ except TypeError:
+ return False
+ return True
+
+
def sort_page_elements(
page_elements: List[Element],
sort_mode: str = SORT_MODE_XY_CUT,
@@ -163,3 +184,82 @@ def _coords_ok(strict_points: bool):
sorted_page_elements = page_elements
return sorted_page_elements
+
+
+def sort_bboxes_by_xy_cut(
+    bboxes,
+    shrink_factor: float = 0.9,
+    xy_cut_primary_direction: str = "x",
+) -> List[int]:
+    """Sort bounding boxes using XY-cut algorithm.
+
+    Args:
+        bboxes: iterable of bounding boxes. Each box is passed through `shrink_bbox` and the
+            results are stacked into a single integer array, so fractional coordinates are
+            truncated before cutting.
+        shrink_factor: forwarded to `shrink_bbox` for every box.
+        xy_cut_primary_direction: "x" selects `recursive_xy_cut_swapped`; any other value
+            selects `recursive_xy_cut`.
+
+    Returns:
+        The index list filled in by the XY-cut routine: positions into `bboxes`, in sorted
+        order. Empty when `bboxes` is empty.
+    """
+
+    shrunken_bboxes = [shrink_bbox(bbox, shrink_factor) for bbox in bboxes]
+    if not shrunken_bboxes:
+        # An empty list becomes a 1-D array of shape (0,) instead of the 2-D (N, 4) array
+        # the XY-cut helpers receive in every other case, and there is nothing to order.
+        return []
+
+    res: List[int] = []
+    xy_cut_sorting_func = (
+        recursive_xy_cut_swapped if xy_cut_primary_direction == "x" else recursive_xy_cut
+    )
+    # The helper reports its result by appending indices to `res` in place.
+    xy_cut_sorting_func(
+        np.asarray(shrunken_bboxes).astype(int),
+        np.arange(len(shrunken_bboxes)),
+        res,
+    )
+    return res
+
+
+def sort_text_regions(
+    elements: List["TextRegion"],
+    sort_mode: str = SORT_MODE_XY_CUT,
+    shrink_factor: float = 0.9,
+    xy_cut_primary_direction: str = "x",
+) -> List["TextRegion"]:
+    """Sort a list of TextRegion elements based on the specified sorting mode.
+
+    Only SORT_MODE_XY_CUT reorders anything; every other mode returns `elements` unchanged.
+    The input is also returned unchanged (after a trace log entry) when any element has no
+    bbox or has a bbox rejected by `bbox_is_valid`.
+
+    Args:
+        elements: regions to order; each is read through `el.bbox.x1/y1/x2/y2`.
+        sort_mode: name of the sorting strategy.
+        shrink_factor: default bbox shrink factor for XY-cut; overridden by the
+            UNSTRUCTURED_XY_CUT_BBOX_SHRINK_FACTOR environment variable when it is set.
+        xy_cut_primary_direction: default primary cut direction; overridden by the
+            UNSTRUCTURED_XY_CUT_PRIMARY_DIRECTION environment variable when it is set.
+
+    Returns:
+        A new list in XY-cut order, or the original `elements` list when no sort is applied.
+
+    Raises:
+        ValueError: if UNSTRUCTURED_XY_CUT_BBOX_SHRINK_FACTOR is set to a non-numeric value.
+    """
+
+    if not elements or sort_mode != SORT_MODE_XY_CUT:
+        return elements
+
+    # A region without a bbox is recorded as None so that it reaches the "skipping sort"
+    # branch below instead of raising AttributeError while the tuples are being built.
+    bboxes = [
+        None if el.bbox is None else (el.bbox.x1, el.bbox.y1, el.bbox.x2, el.bbox.y2)
+        for el in elements
+    ]
+
+    # XY-cut needs a usable box for every element; bail out on the first one that is not.
+    for bbox in bboxes:
+        if bbox is None:
+            trace_logger.detail(  # type: ignore
+                "some or all elements are missing bboxes, skipping sort",
+            )
+            return elements
+        if not bbox_is_valid(bbox):
+            trace_logger.detail(f"bbox {bbox} does not have valid values")  # type: ignore
+            return elements
+
+    # Environment variables take precedence over the values passed by the caller.
+    shrink_factor = float(
+        os.environ.get("UNSTRUCTURED_XY_CUT_BBOX_SHRINK_FACTOR", shrink_factor),
+    )
+    xy_cut_primary_direction = os.environ.get(
+        "UNSTRUCTURED_XY_CUT_PRIMARY_DIRECTION",
+        xy_cut_primary_direction,
+    )
+
+    res = sort_bboxes_by_xy_cut(
+        bboxes=bboxes,
+        shrink_factor=shrink_factor,
+        xy_cut_primary_direction=xy_cut_primary_direction,
+    )
+    return [elements[i] for i in res]