From 71d51805df636f50550201f44ee7e8f1ceefc190 Mon Sep 17 00:00:00 2001 From: Yao You Date: Tue, 2 Jul 2024 17:12:22 -0500 Subject: [PATCH 1/6] feat: use weighted mean for table metrics --- unstructured/metrics/evaluate.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/unstructured/metrics/evaluate.py b/unstructured/metrics/evaluate.py index 4ea44a237a..b440a0cce7 100755 --- a/unstructured/metrics/evaluate.py +++ b/unstructured/metrics/evaluate.py @@ -11,6 +11,7 @@ from pathlib import Path from typing import List, Optional, Union +import numpy as np import pandas as pd from tqdm import tqdm @@ -267,7 +268,14 @@ def _generate_dataframes(self, rows): element_metrics_results = {} for metric in combined_table_metrics: metric_df = has_tables_df[has_tables_df[metric].notnull()] - agg_metric = metric_df[metric].agg([_mean, _stdev, _pstdev, _count]).transpose() + agg_metric = metric_df[metric].agg([_stdev, _pstdev, _count]).transpose() + if metric == "total_tables": + agg_metric["_mean"] = metric_df[metric].mean() + else: + agg_metric["_mean"] = np.average( + metric_df[metric], weights=metric_df["total_tables"] + ) + print(agg_metric) if agg_metric.empty: element_metrics_results[metric] = pd.Series( data=[None, None, None, 0], index=["_mean", "_stdev", "_pstdev", "_count"] From 8f689bfa189e83808c35ee186a7c68832e4cbbe4 Mon Sep 17 00:00:00 2001 From: Yao You Date: Tue, 2 Jul 2024 18:01:01 -0500 Subject: [PATCH 2/6] feat: round numbers to 3 decimal points --- unstructured/metrics/evaluate.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/unstructured/metrics/evaluate.py b/unstructured/metrics/evaluate.py index b440a0cce7..f5bb5a8131 100755 --- a/unstructured/metrics/evaluate.py +++ b/unstructured/metrics/evaluate.py @@ -47,6 +47,13 @@ logger.setLevel(logging.DEBUG) AGG_HEADERS = ["metric", "average", "sample_sd", "population_sd", "count"] +AGG_HEADERS_MAPPING = { + "index": "metric", + "_mean": "average", + "_stdev": "sample_sd", + "_pstdev": "population_sd", + "_count": "count", +} OUTPUT_TYPE_OPTIONS = ["json", "txt"] @@ -272,10 +279,10 @@ def _generate_dataframes(self, rows): if metric == "total_tables": agg_metric["_mean"] = metric_df[metric].mean() else: - agg_metric["_mean"] = np.average( - metric_df[metric], weights=metric_df["total_tables"] + agg_metric["_mean"] = np.round( + np.average(metric_df[metric], weights=metric_df["total_tables"]), + 3, ) - print(agg_metric) if agg_metric.empty: element_metrics_results[metric] = pd.Series( data=[None, None, None, 0], index=["_mean", "_stdev", "_pstdev", "_count"] @@ -283,7 +290,7 @@ def _generate_dataframes(self, rows): else: element_metrics_results[metric] = agg_metric agg_df = pd.DataFrame(element_metrics_results).transpose().reset_index() - agg_df.columns = AGG_HEADERS + agg_df = agg_df.rename(columns=AGG_HEADERS_MAPPING) return df, agg_df From 3a59e5379cb195c330d20d1b979253acb572a795 Mon Sep 17 00:00:00 2001 From: Yao You Date: Tue, 2 Jul 2024 18:03:57 -0500 Subject: [PATCH 3/6] chore: update changelog and bump version number --- CHANGELOG.md | 10 ++++++++++ unstructured/__version__.py | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ee26af6f22..3beb81580b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +## 0.14.10-dev0 + +### Enhancements + +* **Use (number of actual table) weighted average for table metrics** In evaluating table metrics the mean aggregation now uses the actual number of tables in a document to weight the metric scores + +### Features + +### Fixes + ## 0.14.9 ### Enhancements diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 9a5de316e5..abe782aca5 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.14.9" # pragma: no cover +__version__ = "0.14.10-dev0" # pragma: no cover From 017759249bbc152607c6003776836eebb3c99e41 Mon Sep 17 00:00:00 2001 From: Yao You Date: Wed, 3 Jul 2024 11:55:27 -0500 Subject: [PATCH 4/6] test: add test to weighted average --- .../2022-financial-statements-p11.pdf.json | 812 ++++++++++ .../2022-financial-statements-p11.pdf.json | 1416 +++++++++++++++++ test_unstructured/metrics/test_evaluate.py | 11 +- 3 files changed, 2237 insertions(+), 2 deletions(-) create mode 100644 example-docs/test_evaluate_files/gold_standard_table_structure/2022-financial-statements-p11.pdf.json create mode 100644 example-docs/test_evaluate_files/unstructured_output_table_structure/2022-financial-statements-p11.pdf.json diff --git a/example-docs/test_evaluate_files/gold_standard_table_structure/2022-financial-statements-p11.pdf.json b/example-docs/test_evaluate_files/gold_standard_table_structure/2022-financial-statements-p11.pdf.json new file mode 100644 index 0000000000..59f619cf33 --- /dev/null +++ b/example-docs/test_evaluate_files/gold_standard_table_structure/2022-financial-statements-p11.pdf.json @@ -0,0 +1,812 @@ +[ + { + "type": "Header", + "text": "I. General Department" + }, + { + "type": "Title", + "text": 1 + }, + { + "type": "Table", + "text": [ + { + "id": "66f5f15d-273f-43c3-9b51-ec6d28637e12", + "x": 0, + "y": 0, + "w": 1, + "h": 1, + "content": "" + }, + { + "id": "34f5f20a-d2d3-48ed-9c3a-416bca0ff517", + "x": 0, + "y": 1, + "w": 1, + "h": 1, + "content": "Assets" + }, + { + "id": "2330a22c-58d5-4c14-8dcc-7463b1b519f3", + "x": 0, + "y": 2, + "w": 1, + "h": 1, + "content": "Usable currencies" + }, + { + "id": "c9e62a61-33da-4cf3-a3f7-50e779e432ae", + "x": 0, + "y": 3, + "w": 1, + "h": 1, + "content": "Other currencies" + }, + { + "id": "9bd02245-2cff-4d72-ac3c-d14bb9f3e240", + "x": 0, + "y": 4, + "w": 1, + "h": 1, + "content": "Credit outstanding" + }, + { + "id": "3756106b-7b23-48d2-ac7d-af19fc25ff92", + "x": 0, + "y": 5, + "w": 1, + "h": 1, + "content": "Total currencies" + }, + { + "id": "eff641b2-b568-4492-9e0f-6af2a33fc107", + "x": 0, + "y": 6, + "w": 1, + "h": 1, + "content": "SDR holdings" + }, + { + "id": "00601fec-6ed4-401c-bf23-40597a6173bd", + "x": 0, + "y": 7, + "w": 1, + "h": 1, + "content": "Investments" + }, + { + "id": "7a057d2c-a8ad-438e-9e10-a49e7194147a", + "x": 0, + "y": 8, + "w": 1, + "h": 1, + "content": "Gold holdings" + }, + { + "id": "d4c05f57-ff6d-4d02-a23a-cfc3fb78c3fc", + "x": 0, + "y": 9, + "w": 1, + "h": 1, + "content": "Property, plant and equipment and intangible assets" + }, + { + "id": "3c99613d-47c7-468c-9745-84fedcddd33c", + "x": 0, + "y": 10, + "w": 1, + "h": 1, + "content": "Net assets under retirement benefit plans" + }, + { + "id": "9d1b1597-cc83-4b14-b19b-911418f6b7c7", + "x": 0, + "y": 11, + "w": 1, + "h": 1, + "content": "Other assets" + }, + { + "id": "fcad018e-53b5-43b9-b2ec-1a25dd38427b", + "x": 0, + "y": 12, + "w": 1, + "h": 1, + "content": "Total assets" + }, + { + "id": "608b3a56-db63-439a-a842-883a8ef3563c", + "x": 0, + "y": 13, + "w": 1, + "h": 1, + "content": "Liabilities" + }, + { + "id": "a98f2cee-0af4-426b-8990-dd2367721b1f", + "x": 0, + "y": 14, + "w": 1, + "h": 1, + "content": "Special Contingent Account" + }, + { + "id": "dac6f09f-8d7f-468c-9c58-e8e9cb472322", + "x": 0, + "y": 15, + "w": 1, + "h": 1, + "content": "Borrowings" + }, + { + "id": "20287888-b7ea-44c0-bd5f-402e32fad446", + "x": 0, + "y": 16, + "w": 1, + "h": 1, + "content": "Quota subscriptions" + }, + { + "id": "5bdec1a0-8cb2-4399-b078-dccecd64cca0", + "x": 0, + "y": 17, + "w": 1, + "h": 1, + "content": "Net liabilities under retirement benefit plans" + }, + { + "id": "6a8839cd-8554-4aad-813f-a51add864538", + "x": 0, + "y": 18, + "w": 1, + "h": 1, + "content": "Other liabilities" + }, + { + "id": "f6c3100d-6b1d-4efa-8bdb-862da646f037", + "x": 0, + "y": 19, + "w": 1, + "h": 1, + "content": "Total liabilities" + }, + { + "id": "cc43bc34-b7bf-47e2-9036-cd51339f21a8", + "x": 0, + "y": 20, + "w": 1, + "h": 1, + "content": "Reserves of the General Resources Account" + }, + { + "id": "b2d8455c-4a8a-46fc-b22b-8f6da9d19237", + "x": 0, + "y": 21, + "w": 1, + "h": 1, + "content": "Retained earnings of the Investment Account" + }, + { + "id": "faf36e7c-34ff-4725-a1e4-7ed5c923d1a4", + "x": 0, + "y": 22, + "w": 1, + "h": 1, + "content": "Resources of the Special Disbursement Account" + }, + { + "id": "e13ca441-7494-4e72-82c7-235147b02530", + "x": 0, + "y": 23, + "w": 1, + "h": 1, + "content": "Total liabilities, reserves, retained earnings, and resources" + }, + { + "id": "1ad7df6d-9f31-4f45-8090-769546dd0a65", + "x": 1, + "y": 0, + "w": 1, + "h": 1, + "content": "Note" + }, + { + "id": "2501d35a-f1b5-457a-97cc-31fc903b835f", + "x": 1, + "y": 1, + "w": 1, + "h": 1, + "content": "" + }, + { + "id": "474f2539-07b1-4fbd-be3c-1e81c80d66a5", + "x": 1, + "y": 2, + "w": 1, + "h": 1, + "content": "" + }, + { + "id": "b712f0ec-4c64-49c3-919b-57b87d612450", + "x": 1, + "y": 3, + "w": 1, + "h": 1, + "content": "" + }, + { + "id": "68fac5df-08fd-44ad-afc2-ea4d83b2a5d4", + "x": 1, + "y": 4, + "w": 1, + "h": 1, + "content": "5" + }, + { + "id": "0c8e5e2a-868e-470d-b95e-b4af1d2b106e", + "x": 1, + "y": 5, + "w": 1, + "h": 1, + "content": "" + }, + { + "id": "b01c4ad4-be06-4e17-b62a-b654dfb703dc", + "x": 1, + "y": 6, + "w": 1, + "h": 1, + "content": "6" + }, + { + "id": "a4d8eaca-b046-4dd8-80af-03fea8e3e22d", + "x": 1, + "y": 7, + "w": 1, + "h": 1, + "content": "7" + }, + { + "id": "aa674388-765b-4380-b902-07b25dc071a3", + "x": 1, + "y": 8, + "w": 1, + "h": 1, + "content": "9" + }, + { + "id": "40524dab-cb00-4b3a-ad1c-e8b084ca2f02", + "x": 1, + "y": 9, + "w": 1, + "h": 1, + "content": "10" + }, + { + "id": "51fd8888-c373-47b0-aee0-8cbb435f4e80", + "x": 1, + "y": 10, + "w": 1, + "h": 1, + "content": "11" + }, + { + "id": "8025c648-d9f2-46e2-b297-b47a8e87be02", + "x": 1, + "y": 11, + "w": 1, + "h": 1, + "content": "12" + }, + { + "id": "913fd95f-50fa-4051-b0cc-f4fda99ca94d", + "x": 1, + "y": 12, + "w": 1, + "h": 1, + "content": "" + }, + { + "id": "17894253-6c15-4bfb-8044-688b48121d6d", + "x": 1, + "y": 13, + "w": 1, + "h": 1, + "content": "" + }, + { + "id": "2985e339-b559-43de-b61e-15e2c44f2261", + "x": 1, + "y": 14, + "w": 1, + "h": 1, + "content": "13" + }, + { + "id": "32573e9c-98de-4fda-a07d-f4a733bc09ca", + "x": 1, + "y": 15, + "w": 1, + "h": 1, + "content": "14" + }, + { + "id": "174f56b1-6579-4dce-bb41-54697ad6a672", + "x": 1, + "y": 16, + "w": 1, + "h": 1, + "content": "15" + }, + { + "id": "aed9448b-5d3a-49d1-98f5-a25b219879e3", + "x": 1, + "y": 17, + "w": 1, + "h": 1, + "content": "11" + }, + { + "id": "79806387-c606-4e3b-a1c7-14d1df1671fb", + "x": 1, + "y": 18, + "w": 1, + "h": 1, + "content": "12" + }, + { + "id": "72307eaf-9cfd-4075-97d9-76dab90c2469", + "x": 1, + "y": 19, + "w": 1, + "h": 1, + "content": "" + }, + { + "id": "772534a0-3ef9-43a2-ab60-2e18dd0859ec", + "x": 1, + "y": 20, + "w": 1, + "h": 1, + "content": "16" + }, + { + "id": "872339e5-8690-4be2-9e96-ce9e7c385eb7", + "x": 1, + "y": 21, + "w": 1, + "h": 1, + "content": "" + }, + { + "id": "f83024d7-8eba-4b72-a1ee-8654a63a4dc8", + "x": 1, + "y": 22, + "w": 1, + "h": 1, + "content": "" + }, + { + "id": "dc0df0e2-1383-4c2c-86e8-3bdfb747969c", + "x": 1, + "y": 23, + "w": 1, + "h": 1, + "content": "" + }, + { + "id": "156eeaae-e606-424b-9918-33e8a4b4edc7", + "x": 2, + "y": 0, + "w": 1, + "h": 1, + "content": "2022" + }, + { + "id": "d8d77e89-470d-4554-9835-e04d7b2dc42c", + "x": 2, + "y": 1, + "w": 1, + "h": 1, + "content": "" + }, + { + "id": "5f2283d0-c3eb-4586-93c0-2da0eee67fff", + "x": 2, + "y": 2, + "w": 1, + "h": 1, + "content": "292,280" + }, + { + "id": "e263efe7-9c83-4422-8760-d48738724b58", + "x": 2, + "y": 3, + "w": 1, + "h": 1, + "content": "69,407" + }, + { + "id": "7c30f9c7-677f-455c-8d64-8588a976306e", + "x": 2, + "y": 4, + "w": 1, + "h": 1, + "content": "93,031" + }, + { + "id": "790d6a30-7dee-4a88-87ab-f906440df5be", + "x": 2, + "y": 5, + "w": 1, + "h": 1, + "content": "454,718" + }, + { + "id": "c6919305-bbae-40b2-aa61-9c30fb737cf3", + "x": 2, + "y": 6, + "w": 1, + "h": 1, + "content": "22,270" + }, + { + "id": "2bbf179e-21c9-4464-a9bf-1a06e7b5f1d5", + "x": 2, + "y": 7, + "w": 1, + "h": 1, + "content": "25,418" + }, + { + "id": "6fd8d460-bc52-4843-a37a-760bc89f90aa", + "x": 2, + "y": 8, + "w": 1, + "h": 1, + "content": "3,167" + }, + { + "id": "f7dc815c-9d78-45b8-9f11-23c7ec5edf94", + "x": 2, + "y": 9, + "w": 1, + "h": 1, + "content": "551" + }, + { + "id": "91737fe0-b342-4a63-a423-9187156396c2", + "x": 2, + "y": 10, + "w": 1, + "h": 1, + "content": "1,375" + }, + { + "id": "336b3b67-3bc2-4df0-b9e0-9bcd3ed8f51f", + "x": 2, + "y": 11, + "w": 1, + "h": 1, + "content": "911" + }, + { + "id": "a91b131d-27b3-4580-8829-5ef74fd4c83b", + "x": 2, + "y": 12, + "w": 1, + "h": 1, + "content": "508,410" + }, + { + "id": "f5412732-1008-4272-aab5-8bcc9c2bbf42", + "x": 2, + "y": 13, + "w": 1, + "h": 1, + "content": "" + }, + { + "id": "7f69417a-5100-4698-98cf-00c19e7c20d9", + "x": 2, + "y": 14, + "w": 1, + "h": 1, + "content": "\u2014" + }, + { + "id": "30b95999-7ab0-4534-aa1a-27a88a72e023", + "x": 2, + "y": 15, + "w": 1, + "h": 1, + "content": "2,615" + }, + { + "id": "cc53c5a2-a8fe-4e94-b4bd-ba630c1da521", + "x": 2, + "y": 16, + "w": 1, + "h": 1, + "content": "476,272" + }, + { + "id": "3b8158b7-70ed-45de-970d-cd774d9df25e", + "x": 2, + "y": 17, + "w": 1, + "h": 1, + "content": "127" + }, + { + "id": "99370fae-c111-4de2-96a9-6cc4298568a8", + "x": 2, + "y": 18, + "w": 1, + "h": 1, + "content": "970" + }, + { + "id": "1a1810ef-2540-4864-903d-17b54946d812", + "x": 2, + "y": 19, + "w": 1, + "h": 1, + "content": "479,984" + }, + { + "id": "2fb39f36-409d-4ffe-b26b-7d02b2658b34", + "x": 2, + "y": 20, + "w": 1, + "h": 1, + "content": "26,524" + }, + { + "id": "068b6e4c-1c7d-4bf9-bd46-4961a93d7828", + "x": 2, + "y": 21, + "w": 1, + "h": 1, + "content": "1,902" + }, + { + "id": "2366f69b-dc1c-4d09-ba51-ebd2967b7bc0", + "x": 2, + "y": 22, + "w": 1, + "h": 1, + "content": "\u2014" + }, + { + "id": "d9babc16-6049-4fb0-83f7-93f5f8caff79", + "x": 2, + "y": 23, + "w": 1, + "h": 1, + "content": "508,410" + }, + { + "id": "c15bffd8-845d-45fe-b06c-2e2f7ed6845a", + "x": 3, + "y": 0, + "w": 1, + "h": 1, + "content": "2021" + }, + { + "id": "635715bd-ef82-4f2f-af3a-bad37448a647", + "x": 3, + "y": 1, + "w": 1, + "h": 1, + "content": "" + }, + { + "id": "27ea8755-e1ae-4e95-a20e-fa4fe6e5bb7e", + "x": 3, + "y": 2, + "w": 1, + "h": 1, + "content": "297,217" + }, + { + "id": "08911b39-a522-4578-84f8-ae91f795e063", + "x": 3, + "y": 3, + "w": 1, + "h": 1, + "content": "71,651" + }, + { + "id": "1857f867-e92d-4a70-85b7-2ca6b9b7d2f8", + "x": 3, + "y": 4, + "w": 1, + "h": 1, + "content": "89,788" + }, + { + "id": "75436437-bec2-47c2-b2c1-a99159f1311e", + "x": 3, + "y": 5, + "w": 1, + "h": 1, + "content": "458,656" + }, + { + "id": "82333684-445e-4f4e-8e1b-aeea61d953c5", + "x": 3, + "y": 6, + "w": 1, + "h": 1, + "content": "22,203" + }, + { + "id": "a8aeacef-99dc-428d-b95c-6ab981bab1cb", + "x": 3, + "y": 7, + "w": 1, + "h": 1, + "content": "23,032" + }, + { + "id": "74410f40-f4c4-4f44-b7e5-9958c8cb8bab", + "x": 3, + "y": 8, + "w": 1, + "h": 1, + "content": "3,167" + }, + { + "id": "390d2fc9-f167-4b7b-b611-adb781cf9003", + "x": 3, + "y": 9, + "w": 1, + "h": 1, + "content": "555" + }, + { + "id": "f2d06cd8-4de0-4c8b-a215-5859d4a22a1f", + "x": 3, + "y": 10, + "w": 1, + "h": 1, + "content": "\u2014" + }, + { + "id": "59414f75-8b58-4c5b-9656-c27605fe8b29", + "x": 3, + "y": 11, + "w": 1, + "h": 1, + "content": "706" + }, + { + "id": "1073551b-fca8-45f4-9a1a-4443fbe5ce6a", + "x": 3, + "y": 12, + "w": 1, + "h": 1, + "content": "508,319" + }, + { + "id": "afe5fcf4-83de-41f3-9c01-9864fd3d104e", + "x": 3, + "y": 13, + "w": 1, + "h": 1, + "content": "" + }, + { + "id": "4d349793-595d-47c2-9d11-613aa78ffdd6", + "x": 3, + "y": 14, + "w": 1, + "h": 1, + "content": "1,066" + }, + { + "id": "f1942864-03aa-43ac-9196-4a4fce689882", + "x": 3, + "y": 15, + "w": 1, + "h": 1, + "content": "5,138" + }, + { + "id": "76733d69-53ff-418f-ad04-397c00a1c4af", + "x": 3, + "y": 16, + "w": 1, + "h": 1, + "content": "475,808" + }, + { + "id": "d3e41ea2-c8ec-44e6-8883-9bd7b0b2eabc", + "x": 3, + "y": 17, + "w": 1, + "h": 1, + "content": "205" + }, + { + "id": "3774efda-bddb-46ac-a172-004b405b9401", + "x": 3, + "y": 18, + "w": 1, + "h": 1, + "content": "761" + }, + { + "id": "c2db0a5e-c83e-4537-84c4-1b6916a053ba", + "x": 3, + "y": 19, + "w": 1, + "h": 1, + "content": "482,978" + }, + { + "id": "20cdfcb8-0691-41fd-97ec-cc1dcbb82695", + "x": 3, + "y": 20, + "w": 1, + "h": 1, + "content": "23,350" + }, + { + "id": "8ca488c3-bc8c-46b7-a742-7d3de4691aef", + "x": 3, + "y": 21, + "w": 1, + "h": 1, + "content": "1,991" + }, + { + "id": "fcae272e-ae3d-487a-b143-dbae95e41c56", + "x": 3, + "y": 22, + "w": 1, + "h": 1, + "content": "\u2014" + }, + { + "id": "b70f8af7-fa14-4ae0-9010-32756d5a6073", + "x": 3, + "y": 23, + "w": 1, + "h": 1, + "content": "508,319" + } + ] + }, + { + "type": "NarrativeText", + "text": "The accompanying notes are an integral part of these financial statements." + }, + { + "type": "NarrativeText", + "text": "These financial statements were signed by the Managing Director and the Director of Finance on June 24, 2022." + }, + { + "type": "Value" + }, + { + "type": "NarrativeText", + "text": "Kristalina Georgieva /s/ Managing Director" + }, + { + "type": "Value" + }, + { + "type": "NarrativeText", + "text": "Bernard Lauwers /s/ Director, Finance Department" + }, + { + "type": "PageNumber", + "text": 7 + }, + { + "type": "Footer" + } +] \ No newline at end of file diff --git a/example-docs/test_evaluate_files/unstructured_output_table_structure/2022-financial-statements-p11.pdf.json b/example-docs/test_evaluate_files/unstructured_output_table_structure/2022-financial-statements-p11.pdf.json new file mode 100644 index 0000000000..4f2043942c --- /dev/null +++ b/example-docs/test_evaluate_files/unstructured_output_table_structure/2022-financial-statements-p11.pdf.json @@ -0,0 +1,1416 @@ +[ + { + "element_id": "65cc604a285e9b4833c3515c205525f1", + "metadata": { + "coordinates": { + "layout_height": 2250, + "layout_width": 1750, + "points": [ + [ + 164.4, + 135.9 + ], + [ + 164.4, + 158.2 + ], + [ + 402.1, + 158.2 + ], + [ + 402.1, + 135.9 + ] + ], + "system": "PixelSpace" + }, + "detection_class_prob": 0.76059, + "file_directory": "/Users/yaoyou/Downloads/mini-holistic-all/src", + "filename": "2022-financial-statements-p11.pdf", + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "last_modified": "2024-01-11T09:49:09", + "page_number": 1 + }, + "text": "I. General Department", + "type": "Header" + }, + { + "element_id": "525f06761160ee0502ed343a27144bf3", + "metadata": { + "coordinates": { + "layout_height": 2250, + "layout_width": 1750, + "points": [ + [ + 165.0, + 281.1 + ], + [ + 165.0, + 353.2 + ], + [ + 1281.2, + 353.2 + ], + [ + 1281.2, + 281.1 + ] + ], + "system": "PixelSpace" + }, + "file_directory": "/Users/yaoyou/Downloads/mini-holistic-all/src", + "filename": "2022-financial-statements-p11.pdf", + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "last_modified": "2024-01-11T09:49:09", + "page_number": 1, + "parent_id": "65cc604a285e9b4833c3515c205525f1" + }, + "text": "Statements of Financial Position at April 30, 2022, and 2021 (in millions of SDRs)", + "type": "UncategorizedText" + }, + { + "element_id": "335f6e936ff6d431fb172bdcb3c45f62", + "metadata": { + "coordinates": { + "layout_height": 2250, + "layout_width": 1750, + "points": [ + [ + 178.3, + 368.5 + ], + [ + 178.3, + 1285.5 + ], + [ + 1565.2, + 1285.5 + ], + [ + 1565.2, + 368.5 + ] + ], + "system": "PixelSpace" + }, + "detection_class_prob": 0.92937, + "file_directory": "/Users/yaoyou/Downloads/mini-holistic-all/src", + "filename": "2022-financial-statements-p11.pdf", + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "last_modified": "2024-01-11T09:49:09", + "page_number": 1, + "parent_id": "65cc604a285e9b4833c3515c205525f1", + "text_as_html": "
Assets
Usable currencies292,280297,217
Other currencies69,40771,651
Credit outstanding593,03189,788
Total currencies454,718458,656
SDR holdings622,27022,203
Investments725,41823,032
Gold holdings93,1673,167
Property, plant and equipment and intangible assets10551555
Net assets under retirement benefit plans111,375_\u2014
Other assets12911706
Total assets508,410508,319
-iabilities
Special Contingent Account13\u20141,066
Borrowings142,6155,138
Quota subscriptions15476,272475,808
Net liabilities under retirement benefit plans11127205
Other liabilities12970761
otal li S479,984482,978
eserves of the General Resources Account1626,52423,350
Retained earnings of the Investment Account1,9021,991
Resources of the Special Disbursement Account\u2014\u2014
otal lial S, reserves, retained earnings, and resources508,410508,319
" + }, + "text": "Note 2022 2021 Usable currencies 292,280 297,217 Other currencies 69,407 71,651 Credit outstanding 5 93,031 89,788 Total currencies 454,718 458,656 SDR holdings 6 22,270 22,203 Investments 7 25,418 23,032 Gold holdings 9 3,167 3,167 Property, plant and equipment and intangible assets 10 551 555 Net assets under retirement benefit plans 11 1,375 \u2014 Other assets 12 911 706 508,410 508,319 Special Contingent Account 13 \u2014 1,066 Borrowings 14 2,615 5,138 Quota subscriptions 15 476,272 475,808 Net liabilities under retirement benefit plans 11 127 205 Other liabilities 12 970 761 479,984 482,978 16 26,524 23,350 1,902 1,991 \u2014 \u2014 Total liabilities, reserves, retained earnings, and resources 508,410 508,319", + "type": "Table" + }, + { + "element_id": "83333e79d07d5369c1ea81ac84fece49", + "metadata": { + "coordinates": { + "layout_height": 2250, + "layout_width": 1750, + "points": [ + [ + 173.0, + 427.6 + ], + [ + 173.0, + 448.4 + ], + [ + 247.1, + 448.4 + ], + [ + 247.1, + 427.6 + ] + ], + "system": "PixelSpace" + }, + "file_directory": "/Users/yaoyou/Downloads/mini-holistic-all/src", + "filename": "2022-financial-statements-p11.pdf", + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "last_modified": "2024-01-11T09:49:09", + "page_number": 1, + "parent_id": "65cc604a285e9b4833c3515c205525f1" + }, + "text": "Assets", + "type": "Title" + }, + { + "element_id": "477f9805ddecc50afee64c176c77214d", + "metadata": { + "coordinates": { + "layout_height": 2250, + "layout_width": 1750, + "points": [ + [ + 163.0, + 453.3 + ], + [ + 163.0, + 458.7 + ], + [ + 1581.8, + 458.7 + ], + [ + 1581.8, + 453.3 + ] + ], + "system": "PixelSpace" + }, + "file_directory": "/Users/yaoyou/Downloads/mini-holistic-all/src", + "filename": "2022-financial-statements-p11.pdf", + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "last_modified": "2024-01-11T09:49:09", + "page_number": 1 + }, + "text": "", + "type": "Image" + }, + { + "element_id": "ae3bfb88b2bce7580b08ba6cf81d2793", + "metadata": { + "coordinates": { + "layout_height": 2250, + "layout_width": 1750, + "points": [ + [ + 163.0, + 490.7 + ], + [ + 163.0, + 496.0 + ], + [ + 1581.8, + 496.0 + ], + [ + 1581.8, + 490.7 + ] + ], + "system": "PixelSpace" + }, + "file_directory": "/Users/yaoyou/Downloads/mini-holistic-all/src", + "filename": "2022-financial-statements-p11.pdf", + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "last_modified": "2024-01-11T09:49:09", + "page_number": 1 + }, + "text": "", + "type": "Image" + }, + { + "element_id": "40d26315a107679ee6ba3a24e6ab3671", + "metadata": { + "coordinates": { + "layout_height": 2250, + "layout_width": 1750, + "points": [ + [ + 163.0, + 528.0 + ], + [ + 163.0, + 533.3 + ], + [ + 1581.8, + 533.3 + ], + [ + 1581.8, + 528.0 + ] + ], + "system": "PixelSpace" + }, + "file_directory": "/Users/yaoyou/Downloads/mini-holistic-all/src", + "filename": "2022-financial-statements-p11.pdf", + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "last_modified": "2024-01-11T09:49:09", + "page_number": 1 + }, + "text": "", + "type": "Image" + }, + { + "element_id": "7c661779574aaac5f64a3896af6927d7", + "metadata": { + "coordinates": { + "layout_height": 2250, + "layout_width": 1750, + "points": [ + [ + 163.0, + 565.3 + ], + [ + 163.0, + 570.7 + ], + [ + 1581.8, + 570.7 + ], + [ + 1581.8, + 565.3 + ] + ], + "system": "PixelSpace" + }, + "file_directory": "/Users/yaoyou/Downloads/mini-holistic-all/src", + "filename": "2022-financial-statements-p11.pdf", + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "last_modified": "2024-01-11T09:49:09", + "page_number": 1 + }, + "text": "", + "type": "Image" + }, + { + "element_id": "3547fc946bf088d451ca52fbb2487cac", + "metadata": { + "coordinates": { + "layout_height": 2250, + "layout_width": 1750, + "points": [ + [ + 163.0, + 602.7 + ], + [ + 163.0, + 608.0 + ], + [ + 1581.8, + 608.0 + ], + [ + 1581.8, + 602.7 + ] + ], + "system": "PixelSpace" + }, + "file_directory": "/Users/yaoyou/Downloads/mini-holistic-all/src", + "filename": "2022-financial-statements-p11.pdf", + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "last_modified": "2024-01-11T09:49:09", + "page_number": 1 + }, + "text": "", + "type": "Image" + }, + { + "element_id": "df6263781f356939f404b5174102d255", + "metadata": { + "coordinates": { + "layout_height": 2250, + "layout_width": 1750, + "points": [ + [ + 163.0, + 640.2 + ], + [ + 163.0, + 645.5 + ], + [ + 1581.8, + 645.5 + ], + [ + 1581.8, + 640.2 + ] + ], + "system": "PixelSpace" + }, + "file_directory": "/Users/yaoyou/Downloads/mini-holistic-all/src", + "filename": "2022-financial-statements-p11.pdf", + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "last_modified": "2024-01-11T09:49:09", + "page_number": 1 + }, + "text": "", + "type": "Image" + }, + { + "element_id": "dd09db523d1582839b2de71a9a5e8b12", + "metadata": { + "coordinates": { + "layout_height": 2250, + "layout_width": 1750, + "points": [ + [ + 163.0, + 677.5 + ], + [ + 163.0, + 682.8 + ], + [ + 1581.8, + 682.8 + ], + [ + 1581.8, + 677.5 + ] + ], + "system": "PixelSpace" + }, + "file_directory": "/Users/yaoyou/Downloads/mini-holistic-all/src", + "filename": "2022-financial-statements-p11.pdf", + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "last_modified": "2024-01-11T09:49:09", + "page_number": 1 + }, + "text": "", + "type": "Image" + }, + { + "element_id": "8f1bb5f226d788b3e90b8f80020bffd2", + "metadata": { + "coordinates": { + "layout_height": 2250, + "layout_width": 1750, + "points": [ + [ + 163.0, + 714.8 + ], + [ + 163.0, + 720.2 + ], + [ + 1581.8, + 720.2 + ], + [ + 1581.8, + 714.8 + ] + ], + "system": "PixelSpace" + }, + "file_directory": "/Users/yaoyou/Downloads/mini-holistic-all/src", + "filename": "2022-financial-statements-p11.pdf", + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "last_modified": "2024-01-11T09:49:09", + "page_number": 1 + }, + "text": "", + "type": "Image" + }, + { + "element_id": "764d462609915647d3759b1d83f274c5", + "metadata": { + "coordinates": { + "layout_height": 2250, + "layout_width": 1750, + "points": [ + [ + 163.0, + 752.2 + ], + [ + 163.0, + 757.5 + ], + [ + 1581.8, + 757.5 + ], + [ + 1581.8, + 752.2 + ] + ], + "system": "PixelSpace" + }, + "file_directory": "/Users/yaoyou/Downloads/mini-holistic-all/src", + "filename": "2022-financial-statements-p11.pdf", + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "last_modified": "2024-01-11T09:49:09", + "page_number": 1 + }, + "text": "", + "type": "Image" + }, + { + "element_id": "4a56f843b424a3fa37380213b09caa70", + "metadata": { + "coordinates": { + "layout_height": 2250, + "layout_width": 1750, + "points": [ + [ + 163.0, + 789.5 + ], + [ + 163.0, + 794.8 + ], + [ + 1581.8, + 794.8 + ], + [ + 1581.8, + 789.5 + ] + ], + "system": "PixelSpace" + }, + "file_directory": "/Users/yaoyou/Downloads/mini-holistic-all/src", + "filename": "2022-financial-statements-p11.pdf", + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "last_modified": "2024-01-11T09:49:09", + "page_number": 1 + }, + "text": "", + "type": "Image" + }, + { + "element_id": "4ab86a74a79c6de7444538a082b6bb1a", + "metadata": { + "coordinates": { + "layout_height": 2250, + "layout_width": 1750, + "points": [ + [ + 163.0, + 826.8 + ], + [ + 163.0, + 832.2 + ], + [ + 1052.3, + 832.2 + ], + [ + 1052.3, + 826.8 + ] + ], + "system": "PixelSpace" + }, + "file_directory": "/Users/yaoyou/Downloads/mini-holistic-all/src", + "filename": "2022-financial-statements-p11.pdf", + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "last_modified": "2024-01-11T09:49:09", + "page_number": 1 + }, + "text": "", + "type": "Image" + }, + { + "element_id": "871c8a1cf2b1e6c6d00c0b1ba1ee40c6", + "metadata": { + "coordinates": { + "layout_height": 2250, + "layout_width": 1750, + "points": [ + [ + 1438.5, + 827.5 + ], + [ + 1438.5, + 831.5 + ], + [ + 1581.3, + 831.5 + ], + [ + 1581.3, + 827.5 + ] + ], + "system": "PixelSpace" + }, + "file_directory": "/Users/yaoyou/Downloads/mini-holistic-all/src", + "filename": "2022-financial-statements-p11.pdf", + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "last_modified": "2024-01-11T09:49:09", + "page_number": 1 + }, + "text": "", + "type": "Image" + }, + { + "element_id": "6a23a5778b61a46218258943739d73ae", + "metadata": { + "coordinates": { + "layout_height": 2250, + "layout_width": 1750, + "points": [ + [ + 173.0, + 838.6 + ], + [ + 173.0, + 859.4 + ], + [ + 299.3, + 859.4 + ], + [ + 299.3, + 838.6 + ] + ], + "system": "PixelSpace" + }, + "file_directory": "/Users/yaoyou/Downloads/mini-holistic-all/src", + "filename": "2022-financial-statements-p11.pdf", + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "last_modified": "2024-01-11T09:49:09", + "page_number": 1 + }, + "text": "Total assets", + "type": "Title" + }, + { + "element_id": "0caa2844f3a44f1eb9cf1ec9c5b47a2a", + "metadata": { + "coordinates": { + "layout_height": 2250, + "layout_width": 1750, + "points": [ + [ + 173.0, + 881.9 + ], + [ + 173.0, + 902.7 + ], + [ + 275.0, + 902.7 + ], + [ + 275.0, + 881.9 + ] + ], + "system": "PixelSpace" + }, + "file_directory": "/Users/yaoyou/Downloads/mini-holistic-all/src", + "filename": "2022-financial-statements-p11.pdf", + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "last_modified": "2024-01-11T09:49:09", + "page_number": 1 + }, + "text": "Liabilities", + "type": "Title" + }, + { + "element_id": "0e820672ba8ab9c9a6794b5a5d6d3f77", + "metadata": { + "coordinates": { + "layout_height": 2250, + "layout_width": 1750, + "points": [ + [ + 163.0, + 906.2 + ], + [ + 163.0, + 911.5 + ], + [ + 1581.8, + 911.5 + ], + [ + 1581.8, + 906.2 + ] + ], + "system": "PixelSpace" + }, + "file_directory": "/Users/yaoyou/Downloads/mini-holistic-all/src", + "filename": "2022-financial-statements-p11.pdf", + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "last_modified": "2024-01-11T09:49:09", + "page_number": 1 + }, + "text": "", + "type": "Image" + }, + { + "element_id": "f9b633213fd5d561ca3cf1c1a87b05fb", + "metadata": { + "coordinates": { + "layout_height": 2250, + "layout_width": 1750, + "points": [ + [ + 163.0, + 943.7 + ], + [ + 163.0, + 949.0 + ], + [ + 1581.8, + 949.0 + ], + [ + 1581.8, + 943.7 + ] + ], + "system": "PixelSpace" + }, + "file_directory": "/Users/yaoyou/Downloads/mini-holistic-all/src", + "filename": "2022-financial-statements-p11.pdf", + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "last_modified": "2024-01-11T09:49:09", + "page_number": 1 + }, + "text": "", + "type": "Image" + }, + { + "element_id": "522c433f09c8953d1f32c0fdc2de5784", + "metadata": { + "coordinates": { + "layout_height": 2250, + "layout_width": 1750, + "points": [ + [ + 163.0, + 981.0 + ], + [ + 163.0, + 986.3 + ], + [ + 1581.8, + 986.3 + ], + [ + 1581.8, + 981.0 + ] + ], + "system": "PixelSpace" + }, + "file_directory": "/Users/yaoyou/Downloads/mini-holistic-all/src", + "filename": "2022-financial-statements-p11.pdf", + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "last_modified": "2024-01-11T09:49:09", + "page_number": 1 + }, + "text": "", + "type": "Image" + }, + { + "element_id": "5ed918dcb33852f4f4594fc7d0f24f67", + "metadata": { + "coordinates": { + "layout_height": 2250, + "layout_width": 1750, + "points": [ + [ + 163.0, + 1018.3 + ], + [ + 163.0, + 1023.7 + ], + [ + 1581.8, + 1023.7 + ], + [ + 1581.8, + 1018.3 + ] + ], + "system": "PixelSpace" + }, + "file_directory": "/Users/yaoyou/Downloads/mini-holistic-all/src", + "filename": "2022-financial-statements-p11.pdf", + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "last_modified": "2024-01-11T09:49:09", + "page_number": 1 + }, + "text": "", + "type": "Image" + }, + { + "element_id": "771cbe1b8698a307a53dcb7aa6d9adae", + "metadata": { + "coordinates": { + "layout_height": 2250, + "layout_width": 1750, + "points": [ + [ + 163.0, + 1055.7 + ], + [ + 163.0, + 1061.0 + ], + [ + 1581.8, + 1061.0 + ], + [ + 1581.8, + 1055.7 + ] + ], + "system": "PixelSpace" + }, + "file_directory": "/Users/yaoyou/Downloads/mini-holistic-all/src", + "filename": "2022-financial-statements-p11.pdf", + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "last_modified": "2024-01-11T09:49:09", + "page_number": 1 + }, + "text": "", + "type": "Image" + }, + { + "element_id": "abd2d4f5b0f381be46e69dbee1350f68", + "metadata": { + "coordinates": { + "layout_height": 2250, + "layout_width": 1750, + "points": [ + [ + 163.0, + 1093.0 + ], + [ + 163.0, + 1098.3 + ], + [ + 1052.3, + 1098.3 + ], + [ + 1052.3, + 1093.0 + ] + ], + "system": "PixelSpace" + }, + "file_directory": "/Users/yaoyou/Downloads/mini-holistic-all/src", + "filename": "2022-financial-statements-p11.pdf", + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "last_modified": "2024-01-11T09:49:09", + "page_number": 1 + }, + "text": "", + "type": "Image" + }, + { + "element_id": "05d62f9687aafdf4e682faf86157d1f5", + "metadata": { + "coordinates": { + "layout_height": 2250, + "layout_width": 1750, + "points": [ + [ + 1438.5, + 1093.7 + ], + [ + 1438.5, + 1097.7 + ], + [ + 1581.3, + 1097.7 + ], + [ + 1581.3, + 1093.7 + ] + ], + "system": "PixelSpace" + }, + "file_directory": "/Users/yaoyou/Downloads/mini-holistic-all/src", + "filename": "2022-financial-statements-p11.pdf", + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "last_modified": "2024-01-11T09:49:09", + "page_number": 1 + }, + "text": "", + "type": "Image" + }, + { + "element_id": "64bc16b4cf986ed144be6d53e5ac952b", + "metadata": { + "coordinates": { + "layout_height": 2250, + "layout_width": 1750, + "points": [ + [ + 173.0, + 1104.7 + ], + [ + 173.0, + 1125.6 + ], + [ + 323.5, + 1125.6 + ], + [ + 323.5, + 1104.7 + ] + ], + "system": "PixelSpace" + }, + "file_directory": "/Users/yaoyou/Downloads/mini-holistic-all/src", + "filename": "2022-financial-statements-p11.pdf", + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "last_modified": "2024-01-11T09:49:09", + "page_number": 1 + }, + "text": "Total liabilities", + "type": "Title" + }, + { + "element_id": "1e06119c828c6c4389f59b56ea395d34", + "metadata": { + "coordinates": { + "layout_height": 2250, + "layout_width": 1750, + "points": [ + [ + 173.0, + 1142.1 + ], + [ + 173.0, + 1162.9 + ], + [ + 590.9, + 1162.9 + ], + [ + 590.9, + 1142.1 + ] + ], + "system": "PixelSpace" + }, + "file_directory": "/Users/yaoyou/Downloads/mini-holistic-all/src", + "filename": "2022-financial-statements-p11.pdf", + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "last_modified": "2024-01-11T09:49:09", + "page_number": 1 + }, + "text": "Reserves of the General Resources Account", + "type": "Title" + }, + { + "element_id": "7dd7d1fd11bc18b1e374549b576d1f29", + "metadata": { + "coordinates": { + "layout_height": 2250, + "layout_width": 1750, + "points": [ + [ + 163.0, + 1167.8 + ], + [ + 163.0, + 1173.2 + ], + [ + 1581.8, + 1173.2 + ], + [ + 1581.8, + 1167.8 + ] + ], + "system": "PixelSpace" + }, + "file_directory": "/Users/yaoyou/Downloads/mini-holistic-all/src", + "filename": "2022-financial-statements-p11.pdf", + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "last_modified": "2024-01-11T09:49:09", + "page_number": 1 + }, + "text": "", + "type": "Image" + }, + { + "element_id": "26fab233fef39936e212028c23d5e453", + "metadata": { + "coordinates": { + "layout_height": 2250, + "layout_width": 1750, + "points": [ + [ + 173.0, + 1179.6 + ], + [ + 173.0, + 1200.4 + ], + [ + 594.5, + 1200.4 + ], + [ + 594.5, + 1179.6 + ] + ], + "system": "PixelSpace" + }, + "file_directory": "/Users/yaoyou/Downloads/mini-holistic-all/src", + "filename": "2022-financial-statements-p11.pdf", + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "last_modified": "2024-01-11T09:49:09", + "page_number": 1 + }, + "text": "Retained earnings of the Investment Account", + "type": "NarrativeText" + }, + { + "element_id": "d557775a955ec1af6de2e52f473d8419", + "metadata": { + "coordinates": { + "layout_height": 2250, + "layout_width": 1750, + "points": [ + [ + 163.0, + 1205.2 + ], + [ + 163.0, + 1210.5 + ], + [ + 1581.8, + 1210.5 + ], + [ + 1581.8, + 1205.2 + ] + ], + "system": "PixelSpace" + }, + "file_directory": "/Users/yaoyou/Downloads/mini-holistic-all/src", + "filename": "2022-financial-statements-p11.pdf", + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "last_modified": "2024-01-11T09:49:09", + "page_number": 1 + }, + "text": "", + "type": "Image" + }, + { + "element_id": "f86bcd3d03727122b197d75c4e73720a", + "metadata": { + "coordinates": { + "layout_height": 2250, + "layout_width": 1750, + "points": [ + [ + 173.0, + 1216.9 + ], + [ + 173.0, + 1237.7 + ], + [ + 625.6, + 1237.7 + ], + [ + 625.6, + 1216.9 + ] + ], + "system": "PixelSpace" + }, + "file_directory": "/Users/yaoyou/Downloads/mini-holistic-all/src", + "filename": "2022-financial-statements-p11.pdf", + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "last_modified": "2024-01-11T09:49:09", + "page_number": 1 + }, + "text": "Resources of the Special Disbursement Account", + "type": "Title" + }, + { + "element_id": "7365c6ae9bf82f3638fd89674a017674", + "metadata": { + "coordinates": { + "layout_height": 2250, + "layout_width": 1750, + "points": [ + [ + 163.0, + 1242.5 + ], + [ + 163.0, + 1247.8 + ], + [ + 1052.3, + 1247.8 + ], + [ + 1052.3, + 1242.5 + ] + ], + "system": "PixelSpace" + }, + "file_directory": "/Users/yaoyou/Downloads/mini-holistic-all/src", + "filename": "2022-financial-statements-p11.pdf", + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "last_modified": "2024-01-11T09:49:09", + "page_number": 1 + }, + "text": "", + "type": "Image" + }, + { + "element_id": "6b982880d0aa2a409eebd8bdad4c5ee7", + "metadata": { + "coordinates": { + "layout_height": 2250, + "layout_width": 1750, + "points": [ + [ + 1438.5, + 1243.2 + ], + [ + 1438.5, + 1247.2 + ], + [ + 1581.3, + 1247.2 + ], + [ + 1581.3, + 1243.2 + ] + ], + "system": "PixelSpace" + }, + "file_directory": "/Users/yaoyou/Downloads/mini-holistic-all/src", + "filename": "2022-financial-statements-p11.pdf", + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "last_modified": "2024-01-11T09:49:09", + "page_number": 1 + }, + "text": "", + "type": "Image" + }, + { + "element_id": "a61b273b6924ef539ab19372ec8f9d29", + "metadata": { + "coordinates": { + "layout_height": 2250, + "layout_width": 1750, + "points": [ + [ + 165.0, + 1300.0 + ], + [ + 165.0, + 1339.8 + ], + [ + 1071.4, + 1339.8 + ], + [ + 1071.4, + 1300.0 + ] + ], + "system": "PixelSpace" + }, + "file_directory": "/Users/yaoyou/Downloads/mini-holistic-all/src", + "filename": "2022-financial-statements-p11.pdf", + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "last_modified": "2024-01-11T09:49:09", + "page_number": 1 + }, + "text": "The accompanying notes are an integral part of these financial statements. These financial statements were signed by the Managing Director and the Director of Finance on June 24, 2022.", + "type": "NarrativeText" + }, + { + "element_id": "34420b5b7fa4d3125118bc071d5762e3", + "metadata": { + "coordinates": { + "layout_height": 2250, + "layout_width": 1750, + "points": [ + [ + 172.1, + 1607.0 + ], + [ + 172.1, + 1658.9 + ], + [ + 415.9, + 1658.9 + ], + [ + 415.9, + 1607.0 + ] + ], + "system": "PixelSpace" + }, + "detection_class_prob": 0.66378, + "file_directory": "/Users/yaoyou/Downloads/mini-holistic-all/src", + "filename": "2022-financial-statements-p11.pdf", + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "last_modified": "2024-01-11T09:49:09", + "page_number": 1 + }, + "text": "Kristalina Georgieva /s/ Managing Director", + "type": "NarrativeText" + }, + { + "element_id": "73c1f3e3b4e289ab4fac59e1c7efb58d", + "metadata": { + "coordinates": { + "layout_height": 2250, + "layout_width": 1750, + "points": [ + [ + 992.5, + 1607.0 + ], + [ + 992.5, + 1659.1 + ], + [ + 1294.8, + 1659.1 + ], + [ + 1294.8, + 1607.0 + ] + ], + "system": "PixelSpace" + }, + "detection_class_prob": 0.72551, + "file_directory": "/Users/yaoyou/Downloads/mini-holistic-all/src", + "filename": "2022-financial-statements-p11.pdf", + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "last_modified": "2024-01-11T09:49:09", + "page_number": 1 + }, + "text": "Bernard Lauwers /s/ Director, Finance Department", + "type": "NarrativeText" + }, + { + "element_id": "545721361a36a5a11ede506e3d442ccc", + "metadata": { + "coordinates": { + "layout_height": 2250, + "layout_width": 1750, + "points": [ + [ + 1572.7, + 2149.0 + ], + [ + 1572.7, + 2171.2 + ], + [ + 1591.2, + 2171.2 + ], + [ + 1591.2, + 2149.0 + ] + ], + "system": "PixelSpace" + }, + "file_directory": "/Users/yaoyou/Downloads/mini-holistic-all/src", + "filename": "2022-financial-statements-p11.pdf", + "filetype": "application/pdf", + "languages": [ + "eng" + ], + "last_modified": "2024-01-11T09:49:09", + "page_number": 1 + }, + "text": "7", + "type": "UncategorizedText" + } +] \ No newline at end of file diff --git a/test_unstructured/metrics/test_evaluate.py b/test_unstructured/metrics/test_evaluate.py index e797601249..df7607be4d 100644 --- a/test_unstructured/metrics/test_evaluate.py +++ b/test_unstructured/metrics/test_evaluate.py @@ -190,9 +190,16 @@ def test_table_structure_evaluation(): assert os.path.isfile(os.path.join(export_dir, "all-docs-table-structure-accuracy.tsv")) assert os.path.isfile(os.path.join(export_dir, "aggregate-table-structure-accuracy.tsv")) df = pd.read_csv(os.path.join(export_dir, "all-docs-table-structure-accuracy.tsv"), sep="\t") - assert len(df) == 1 + agg_df = pd.read_csv( + os.path.join(export_dir, "aggregate-table-structure-accuracy.tsv"), sep="\t" + ).set_index("metric") + assert len(df) == 2 assert len(df.columns) == 17 - assert df.iloc[0].filename == "IRS-2023-Form-1095-A.pdf" + assert df.iloc[1].filename == "IRS-2023-Form-1095-A.pdf" + assert ( + np.round(np.average(df["table_level_acc"], weights=df["total_tables"]), 3) + == agg_df.loc["table_level_acc", "average"] + ) @pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container") From 1caee953a4a3792838d85c388dbffa8e68dd45bb Mon Sep 17 00:00:00 2001 From: Yao You Date: Mon, 8 Jul 2024 15:09:17 -0500 Subject: [PATCH 5/6] fix: take false negative into consideration - false negative tables gets a 0 score for table level acc metric and a weight equal to 1 table per page - false negative tables do not contribute to other table metrics since there is no ground truth to evaluate structure or content of those tables --- .../2022-financial-statements-p11.pdf.json | 625 ++++++++++++++++++ test_unstructured/metrics/test_evaluate.py | 4 +- unstructured/metrics/evaluate.py | 17 +- unstructured/metrics/table/table_eval.py | 4 + 4 files changed, 646 insertions(+), 4 deletions(-) diff --git a/example-docs/test_evaluate_files/unstructured_output_table_structure/2022-financial-statements-p11.pdf.json b/example-docs/test_evaluate_files/unstructured_output_table_structure/2022-financial-statements-p11.pdf.json index 4f2043942c..4a1cc5c021 100644 --- a/example-docs/test_evaluate_files/unstructured_output_table_structure/2022-financial-statements-p11.pdf.json +++ b/example-docs/test_evaluate_files/unstructured_output_table_structure/2022-financial-statements-p11.pdf.json @@ -113,6 +113,631 @@ "last_modified": "2024-01-11T09:49:09", "page_number": 1, "parent_id": "65cc604a285e9b4833c3515c205525f1", + "table_as_cells": [ + { + "content": "Assets", + "h": 1, + "w": 1, + "x": 0, + "y": 0 + }, + { + "content": "Usable currencies", + "h": 1, + "w": 1, + "x": 0, + "y": 1 + }, + { + "content": "Other currencies", + "h": 1, + "w": 1, + "x": 0, + "y": 2 + }, + { + "content": "Credit outstanding", + "h": 1, + "w": 1, + "x": 0, + "y": 3 + }, + { + "content": "Total currencies", + "h": 1, + "w": 1, + "x": 0, + "y": 4 + }, + { + "content": "SDR holdings", + "h": 1, + "w": 1, + "x": 0, + "y": 5 + }, + { + "content": "Investments", + "h": 1, + "w": 1, + "x": 0, + "y": 6 + }, + { + "content": "Gold holdings", + "h": 1, + "w": 1, + "x": 0, + "y": 7 + }, + { + "content": "Property, plant and equipment and intangible assets", + "h": 1, + "w": 1, + "x": 0, + "y": 8 + }, + { + "content": "Net assets under retirement benefit plans", + "h": 1, + "w": 1, + "x": 0, + "y": 9 + }, + { + "content": "Other assets", + "h": 1, + "w": 1, + "x": 0, + "y": 10 + }, + { + "content": "Total assets", + "h": 1, + "w": 1, + "x": 0, + "y": 11 + }, + { + "content": "Special Contingent Account", + "h": 1, + "w": 1, + "x": 0, + "y": 13 + }, + { + "content": "Borrowings", + "h": 1, + "w": 1, + "x": 0, + "y": 14 + }, + { + "content": "Quota subscriptions", + "h": 1, + "w": 1, + "x": 0, + "y": 15 + }, + { + "content": "Net liabilities under retirement benefit plans", + "h": 1, + "w": 1, + "x": 0, + "y": 16 + }, + { + "content": "Other liabilities", + "h": 1, + "w": 1, + "x": 0, + "y": 17 + }, + { + "content": "otal li S", + "h": 1, + "w": 1, + "x": 0, + "y": 18 + }, + { + "content": "eserves of the General Resources Account", + "h": 1, + "w": 1, + "x": 0, + "y": 19 + }, + { + "content": "Retained earnings of the Investment Account", + "h": 1, + "w": 1, + "x": 0, + "y": 20 + }, + { + "content": "Resources of the Special Disbursement Account", + "h": 1, + "w": 1, + "x": 0, + "y": 21 + }, + { + "content": "otal lial S, reserves, retained earnings, and resources", + "h": 1, + "w": 1, + "x": 0, + "y": 22 + }, + { + "content": "", + "h": 1, + "w": 1, + "x": 1, + "y": 0 + }, + { + "content": "", + "h": 1, + "w": 1, + "x": 1, + "y": 1 + }, + { + "content": "", + "h": 1, + "w": 1, + "x": 1, + "y": 2 + }, + { + "content": "5", + "h": 1, + "w": 1, + "x": 1, + "y": 3 + }, + { + "content": "", + "h": 1, + "w": 1, + "x": 1, + "y": 4 + }, + { + "content": "6", + "h": 1, + "w": 1, + "x": 1, + "y": 5 + }, + { + "content": "7", + "h": 1, + "w": 1, + "x": 1, + "y": 6 + }, + { + "content": "9", + "h": 1, + "w": 1, + "x": 1, + "y": 7 + }, + { + "content": "10", + "h": 1, + "w": 1, + "x": 1, + "y": 8 + }, + { + "content": "11", + "h": 1, + "w": 1, + "x": 1, + "y": 9 + }, + { + "content": "12", + "h": 1, + "w": 1, + "x": 1, + "y": 10 + }, + { + "content": "", + "h": 1, + "w": 1, + "x": 1, + "y": 11 + }, + { + "content": "13", + "h": 1, + "w": 1, + "x": 1, + "y": 13 + }, + { + "content": "14", + "h": 1, + "w": 1, + "x": 1, + "y": 14 + }, + { + "content": "15", + "h": 1, + "w": 1, + "x": 1, + "y": 15 + }, + { + "content": "11", + "h": 1, + "w": 1, + "x": 1, + "y": 16 + }, + { + "content": "12", + "h": 1, + "w": 1, + "x": 1, + "y": 17 + }, + { + "content": "", + "h": 1, + "w": 1, + "x": 1, + "y": 18 + }, + { + "content": "16", + "h": 1, + "w": 1, + "x": 1, + "y": 19 + }, + { + "content": "", + "h": 1, + "w": 1, + "x": 1, + "y": 20 + }, + { + "content": "", + "h": 1, + "w": 1, + "x": 1, + "y": 21 + }, + { + "content": "", + "h": 1, + "w": 1, + "x": 1, + "y": 22 + }, + { + "content": "", + "h": 1, + "w": 1, + "x": 2, + "y": 0 + }, + { + "content": "292,280", + "h": 1, + "w": 1, + "x": 2, + "y": 1 + }, + { + "content": "69,407", + "h": 1, + "w": 1, + "x": 2, + "y": 2 + }, + { + "content": "93,031", + "h": 1, + "w": 1, + "x": 2, + "y": 3 + }, + { + "content": "454,718", + "h": 1, + "w": 1, + "x": 2, + "y": 4 + }, + { + "content": "22,270", + "h": 1, + "w": 1, + "x": 2, + "y": 5 + }, + { + "content": "25,418", + "h": 1, + "w": 1, + "x": 2, + "y": 6 + }, + { + "content": "3,167", + "h": 1, + "w": 1, + "x": 2, + "y": 7 + }, + { + "content": "551", + "h": 1, + "w": 1, + "x": 2, + "y": 8 + }, + { + "content": "1,375", + "h": 1, + "w": 1, + "x": 2, + "y": 9 + }, + { + "content": "911", + "h": 1, + "w": 1, + "x": 2, + "y": 10 + }, + { + "content": "508,410", + "h": 1, + "w": 1, + "x": 2, + "y": 11 + }, + { + "content": "\u2014", + "h": 1, + "w": 1, + "x": 2, + "y": 13 + }, + { + "content": "2,615", + "h": 1, + "w": 1, + "x": 2, + "y": 14 + }, + { + "content": "476,272", + "h": 1, + "w": 1, + "x": 2, + "y": 15 + }, + { + "content": "127", + "h": 1, + "w": 1, + "x": 2, + "y": 16 + }, + { + "content": "970", + "h": 1, + "w": 1, + "x": 2, + "y": 17 + }, + { + "content": "479,984", + "h": 1, + "w": 1, + "x": 2, + "y": 18 + }, + { + "content": "26,524", + "h": 1, + "w": 1, + "x": 2, + "y": 19 + }, + { + "content": "1,902", + "h": 1, + "w": 1, + "x": 2, + "y": 20 + }, + { + "content": "\u2014", + "h": 1, + "w": 1, + "x": 2, + "y": 21 + }, + { + "content": "508,410", + "h": 1, + "w": 1, + "x": 2, + "y": 22 + }, + { + "content": "", + "h": 1, + "w": 1, + "x": 3, + "y": 0 + }, + { + "content": "297,217", + "h": 1, + "w": 1, + "x": 3, + "y": 1 + }, + { + "content": "71,651", + "h": 1, + "w": 1, + "x": 3, + "y": 2 + }, + { + "content": "89,788", + "h": 1, + "w": 1, + "x": 3, + "y": 3 + }, + { + "content": "458,656", + "h": 1, + "w": 1, + "x": 3, + "y": 4 + }, + { + "content": "22,203", + "h": 1, + "w": 1, + "x": 3, + "y": 5 + }, + { + "content": "23,032", + "h": 1, + "w": 1, + "x": 3, + "y": 6 + }, + { + "content": "3,167", + "h": 1, + "w": 1, + "x": 3, + "y": 7 + }, + { + "content": "555", + "h": 1, + "w": 1, + "x": 3, + "y": 8 + }, + { + "content": "_\u2014", + "h": 1, + "w": 1, + "x": 3, + "y": 9 + }, + { + "content": "706", + "h": 1, + "w": 1, + "x": 3, + "y": 10 + }, + { + "content": "508,319", + "h": 1, + "w": 1, + "x": 3, + "y": 11 + }, + { + "content": "1,066", + "h": 1, + "w": 1, + "x": 3, + "y": 13 + }, + { + "content": "5,138", + "h": 1, + "w": 1, + "x": 3, + "y": 14 + }, + { + "content": "475,808", + "h": 1, + "w": 1, + "x": 3, + "y": 15 + }, + { + "content": "205", + "h": 1, + "w": 1, + "x": 3, + "y": 16 + }, + { + "content": "761", + "h": 1, + "w": 1, + "x": 3, + "y": 17 + }, + { + "content": "482,978", + "h": 1, + "w": 1, + "x": 3, + "y": 18 + }, + { + "content": "23,350", + "h": 1, + "w": 1, + "x": 3, + "y": 19 + }, + { + "content": "1,991", + "h": 1, + "w": 1, + "x": 3, + "y": 20 + }, + { + "content": "\u2014", + "h": 1, + "w": 1, + "x": 3, + "y": 21 + }, + { + "content": "508,319", + "h": 1, + "w": 1, + "x": 3, + "y": 22 + }, + { + "content": "-iabilities", + "h": 1, + "w": 4, + "x": 0, + "y": 12 + } + ], "text_as_html": "
Assets
Usable currencies292,280297,217
Other currencies69,40771,651
Credit outstanding593,03189,788
Total currencies454,718458,656
SDR holdings622,27022,203
Investments725,41823,032
Gold holdings93,1673,167
Property, plant and equipment and intangible assets10551555
Net assets under retirement benefit plans111,375_\u2014
Other assets12911706
Total assets508,410508,319
-iabilities
Special Contingent Account13\u20141,066
Borrowings142,6155,138
Quota subscriptions15476,272475,808
Net liabilities under retirement benefit plans11127205
Other liabilities12970761
otal li S479,984482,978
eserves of the General Resources Account1626,52423,350
Retained earnings of the Investment Account1,9021,991
Resources of the Special Disbursement Account\u2014\u2014
otal lial S, reserves, retained earnings, and resources508,410508,319
" }, "text": "Note 2022 2021 Usable currencies 292,280 297,217 Other currencies 69,407 71,651 Credit outstanding 5 93,031 89,788 Total currencies 454,718 458,656 SDR holdings 6 22,270 22,203 Investments 7 25,418 23,032 Gold holdings 9 3,167 3,167 Property, plant and equipment and intangible assets 10 551 555 Net assets under retirement benefit plans 11 1,375 \u2014 Other assets 12 911 706 508,410 508,319 Special Contingent Account 13 \u2014 1,066 Borrowings 14 2,615 5,138 Quota subscriptions 15 476,272 475,808 Net liabilities under retirement benefit plans 11 127 205 Other liabilities 12 970 761 479,984 482,978 16 26,524 23,350 1,902 1,991 \u2014 \u2014 Total liabilities, reserves, retained earnings, and resources 508,410 508,319", diff --git a/test_unstructured/metrics/test_evaluate.py b/test_unstructured/metrics/test_evaluate.py index df7607be4d..fe9274fbe5 100644 --- a/test_unstructured/metrics/test_evaluate.py +++ b/test_unstructured/metrics/test_evaluate.py @@ -115,7 +115,7 @@ def test_text_extraction_evaluation(): UNSTRUCTURED_TABLE_STRUCTURE_DIRNAME, GOLD_TABLE_STRUCTURE_DIRNAME, Path("IRS-2023-Form-1095-A.pdf.json"), - 17, + 18, {}, ), ( @@ -194,7 +194,7 @@ def test_table_structure_evaluation(): os.path.join(export_dir, "aggregate-table-structure-accuracy.tsv"), sep="\t" ).set_index("metric") assert len(df) == 2 - assert len(df.columns) == 17 + assert len(df.columns) == 19 assert df.iloc[1].filename == "IRS-2023-Form-1095-A.pdf" assert ( np.round(np.average(df["table_level_acc"], weights=df["total_tables"]), 3) diff --git a/unstructured/metrics/evaluate.py b/unstructured/metrics/evaluate.py index f5bb5a8131..049bce36fb 100755 --- a/unstructured/metrics/evaluate.py +++ b/unstructured/metrics/evaluate.py @@ -247,6 +247,7 @@ def _process_document(self, doc: Path) -> list: out_filename, doctype, connector, + report_from_cells.total_predicted_tables, ] + [getattr(report_from_html, metric) for metric in self.supported_metric_names] + [getattr(report_from_cells, metric) for metric in self.supported_metric_names] @@ -262,10 +263,15 @@ def _generate_dataframes(self, rows): "filename", "doctype", "connector", + "total_predicted_tables", ] + combined_table_metrics df = pd.DataFrame(rows, columns=headers) - has_tables_df = df[df["total_tables"] > 0] + df["_table_weights"] = df["total_tables"] + # we give false positive tables a 1 table worth of weight in computing table level acc + df["_table_weights"][(df["total_tables"] == 0) & (df["total_predicted_tables"] > 0)] = 1 + # filter down to only those with actual and/or predicted tables + has_tables_df = df[df["_table_weights"] > 0] if has_tables_df.empty: agg_df = pd.DataFrame( @@ -276,9 +282,16 @@ def _generate_dataframes(self, rows): for metric in combined_table_metrics: metric_df = has_tables_df[has_tables_df[metric].notnull()] agg_metric = metric_df[metric].agg([_stdev, _pstdev, _count]).transpose() - if metric == "total_tables": + if metric.startswith("total_tables"): agg_metric["_mean"] = metric_df[metric].mean() + elif metric.startswith("table_level_acc"): + agg_metric["_mean"] = np.round( + np.average(metric_df[metric], weights=metric_df["_table_weights"]), + 3, + ) else: + # false positive tables do not contribute to table structure and content + # extraction metrics agg_metric["_mean"] = np.round( np.average(metric_df[metric], weights=metric_df["total_tables"]), 3, diff --git a/unstructured/metrics/table/table_eval.py b/unstructured/metrics/table/table_eval.py index a25cf30c3d..b3d37197a4 100644 --- a/unstructured/metrics/table/table_eval.py +++ b/unstructured/metrics/table/table_eval.py @@ -41,6 +41,7 @@ class TableEvaluation: """Class representing a gathered table metrics.""" total_tables: int + total_predicted_tables: int table_level_acc: float element_col_level_index_acc: float element_row_level_index_acc: float @@ -208,6 +209,7 @@ def process_file(self) -> TableEvaluation: table_acc = 1 if not is_table_predicted else 0 return TableEvaluation( total_tables=0, + total_predicted_tables=len(predicted_table_data), table_level_acc=table_acc, element_col_level_index_acc=score, element_row_level_index_acc=score, @@ -217,6 +219,7 @@ def process_file(self) -> TableEvaluation: if is_table_in_gt and not is_table_predicted: return TableEvaluation( total_tables=len(ground_truth_table_data), + total_predicted_tables=0, table_level_acc=0, element_col_level_index_acc=0, element_row_level_index_acc=0, @@ -242,6 +245,7 @@ def process_file(self) -> TableEvaluation: evaluation = TableEvaluation( total_tables=len(ground_truth_table_data), + total_predicted_tables=len(predicted_table_data), table_level_acc=predicted_table_acc, element_col_level_index_acc=metrics.get("col_index_acc", 0), element_row_level_index_acc=metrics.get("row_index_acc", 0), From 0381edf6b236862d8ac7292ba4aee1c031de33db Mon Sep 17 00:00:00 2001 From: Yao You Date: Tue, 9 Jul 2024 10:42:00 -0500 Subject: [PATCH 6/6] fix: use a cleaner pandas syntax --- unstructured/metrics/evaluate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured/metrics/evaluate.py b/unstructured/metrics/evaluate.py index 0f76f63f02..718ac96091 100755 --- a/unstructured/metrics/evaluate.py +++ b/unstructured/metrics/evaluate.py @@ -255,7 +255,7 @@ def _generate_dataframes(self, rows): df = pd.DataFrame(rows, columns=headers) df["_table_weights"] = df["total_tables"] # we give false positive tables a 1 table worth of weight in computing table level acc - df["_table_weights"][(df["total_tables"] == 0) & (df["total_predicted_tables"] > 0)] = 1 + df["_table_weights"][df.total_tables.eq(0) & df.total_predicted_tables.gt(0)] = 1 # filter down to only those with actual and/or predicted tables has_tables_df = df[df["_table_weights"] > 0]