From f7a9af8149ca19405b513de9484318c8595a91b0 Mon Sep 17 00:00:00 2001 From: Michal Martyniak Date: Mon, 22 Apr 2024 14:48:10 +0200 Subject: [PATCH] debug: print traceback only for azure src ingestion --- test_unstructured_ingest/test-ingest-src.sh | 88 ++++++++++----------- unstructured/documents/elements.py | 7 ++ 2 files changed, 51 insertions(+), 44 deletions(-) diff --git a/test_unstructured_ingest/test-ingest-src.sh b/test_unstructured_ingest/test-ingest-src.sh index 7ceba1480d..aad941fb60 100755 --- a/test_unstructured_ingest/test-ingest-src.sh +++ b/test_unstructured_ingest/test-ingest-src.sh @@ -17,51 +17,51 @@ EVAL_OUTPUT_ROOT=${EVAL_OUTPUT_ROOT:-$SCRIPT_DIR} export OMP_THREAD_LIMIT=1 all_tests=( - 's3.sh' - 's3-minio.sh' + # 's3.sh' + # 's3-minio.sh' 'azure.sh' - 'biomed-api.sh' - 'biomed-path.sh' - # NOTE(yuming): The pdf-fast-reprocess test should be put after any tests that save downloaded files - 'pdf-fast-reprocess.sh' - 'salesforce.sh' - 'box.sh' - 'discord.sh' - 'dropbox.sh' - 'github.sh' - 'gitlab.sh' - 'google-drive.sh' - 'wikipedia.sh' - 'local.sh' - 'slack.sh' - 'against-api.sh' - 'gcs.sh' - 'onedrive.sh' - 'outlook.sh' - 'elasticsearch.sh' - 'confluence-diff.sh' - 'confluence-large.sh' - 'airtable-diff.sh' - # NOTE(ryan): This test is disabled because it is triggering too many requests to the API - # 'airtable-large.sh' - 'local-single-file.sh' - 'local-single-file-basic-chunking.sh' - 'local-single-file-chunk-no-orig-elements.sh' - 'local-single-file-with-encoding.sh' - 'local-single-file-with-pdf-infer-table-structure.sh' - 'notion.sh' - 'delta-table.sh' - 'jira.sh' - 'sharepoint.sh' - 'sharepoint-with-permissions.sh' - 'hubspot.sh' - 'local-embed.sh' - 'local-embed-bedrock.sh' - 'local-embed-octoai.sh' - 'local-embed-vertexai.sh' - 'sftp.sh' - 'mongodb.sh' - 'opensearch.sh' + # 'biomed-api.sh' + # 'biomed-path.sh' + # # NOTE(yuming): The pdf-fast-reprocess test should be put after any tests that save downloaded files + # 'pdf-fast-reprocess.sh' + # 'salesforce.sh' + # 'box.sh' + # 'discord.sh' + # 'dropbox.sh' + # 'github.sh' + # 'gitlab.sh' + # 'google-drive.sh' + # 'wikipedia.sh' + # 'local.sh' + # 'slack.sh' + # 'against-api.sh' + # 'gcs.sh' + # 'onedrive.sh' + # 'outlook.sh' + # 'elasticsearch.sh' + # 'confluence-diff.sh' + # 'confluence-large.sh' + # 'airtable-diff.sh' + # # NOTE(ryan): This test is disabled because it is triggering too many requests to the API + # # 'airtable-large.sh' + # 'local-single-file.sh' + # 'local-single-file-basic-chunking.sh' + # 'local-single-file-chunk-no-orig-elements.sh' + # 'local-single-file-with-encoding.sh' + # 'local-single-file-with-pdf-infer-table-structure.sh' + # 'notion.sh' + # 'delta-table.sh' + # 'jira.sh' + # 'sharepoint.sh' + # 'sharepoint-with-permissions.sh' + # 'hubspot.sh' + # 'local-embed.sh' + # 'local-embed-bedrock.sh' + # 'local-embed-octoai.sh' + # 'local-embed-vertexai.sh' + # 'sftp.sh' + # 'mongodb.sh' + # 'opensearch.sh' ) full_python_matrix_tests=( diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index a29b9ea034..9ebe073c7c 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -10,6 +10,8 @@ import os import pathlib import re +import sys +import traceback import uuid from itertools import groupby from types import MappingProxyType @@ -522,10 +524,15 @@ def assign_and_map_hash_ids(elements: list[Element]) -> list[Element]: ] # -- assign hash IDs to elements -- + print("before") + print([e._element_id for e in elements]) old_to_new_mapping = { element.id: element.id_to_hash(seq_on_page_counter) for element, seq_on_page_counter in zip(elements, page_seq_pairs) } + print("after") + print([e._element_id for e in elements]) + traceback.print_stack(file=sys.stdout) # -- map old parent IDs to new ones -- for e in elements: