Skip to content

Commit

Permalink
debug: print traceback only for azure src ingestion
Browse files Browse the repository at this point in the history
  • Loading branch information
micmarty-deepsense committed Apr 22, 2024
1 parent cc8be15 commit f7a9af8
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 44 deletions.
88 changes: 44 additions & 44 deletions test_unstructured_ingest/test-ingest-src.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,51 +17,51 @@ EVAL_OUTPUT_ROOT=${EVAL_OUTPUT_ROOT:-$SCRIPT_DIR}
export OMP_THREAD_LIMIT=1

# List of ingest source test scripts to run.
# NOTE(review): this span was diff residue containing BOTH the pre-change
# (active) and post-change (commented) entries; reconstructed here to the
# commit's post-change state — only the azure source test is enabled, per
# the commit message "debug: print traceback only for azure src ingestion".
all_tests=(
# 's3.sh'
# 's3-minio.sh'
'azure.sh'
# 'biomed-api.sh'
# 'biomed-path.sh'
# # NOTE(yuming): The pdf-fast-reprocess test should be put after any tests that save downloaded files
# 'pdf-fast-reprocess.sh'
# 'salesforce.sh'
# 'box.sh'
# 'discord.sh'
# 'dropbox.sh'
# 'github.sh'
# 'gitlab.sh'
# 'google-drive.sh'
# 'wikipedia.sh'
# 'local.sh'
# 'slack.sh'
# 'against-api.sh'
# 'gcs.sh'
# 'onedrive.sh'
# 'outlook.sh'
# 'elasticsearch.sh'
# 'confluence-diff.sh'
# 'confluence-large.sh'
# 'airtable-diff.sh'
# # NOTE(ryan): This test is disabled because it is triggering too many requests to the API
# # 'airtable-large.sh'
# 'local-single-file.sh'
# 'local-single-file-basic-chunking.sh'
# 'local-single-file-chunk-no-orig-elements.sh'
# 'local-single-file-with-encoding.sh'
# 'local-single-file-with-pdf-infer-table-structure.sh'
# 'notion.sh'
# 'delta-table.sh'
# 'jira.sh'
# 'sharepoint.sh'
# 'sharepoint-with-permissions.sh'
# 'hubspot.sh'
# 'local-embed.sh'
# 'local-embed-bedrock.sh'
# 'local-embed-octoai.sh'
# 'local-embed-vertexai.sh'
# 'sftp.sh'
# 'mongodb.sh'
# 'opensearch.sh'
)

full_python_matrix_tests=(
Expand Down
7 changes: 7 additions & 0 deletions unstructured/documents/elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
import os
import pathlib
import re
import sys
import traceback
import uuid
from itertools import groupby
from types import MappingProxyType
Expand Down Expand Up @@ -522,10 +524,15 @@ def assign_and_map_hash_ids(elements: list[Element]) -> list[Element]:
]

# -- assign hash IDs to elements --
print("before")
print([e._element_id for e in elements])
old_to_new_mapping = {
element.id: element.id_to_hash(seq_on_page_counter)
for element, seq_on_page_counter in zip(elements, page_seq_pairs)
}
print("after")
print([e._element_id for e in elements])
traceback.print_stack(file=sys.stdout)

# -- map old parent IDs to new ones --
for e in elements:
Expand Down

0 comments on commit f7a9af8

Please sign in to comment.