Skip to content

Commit

Permalink
finish setting up e2e test for gcs
Browse files Browse the repository at this point in the history
  • Loading branch information
rbiseck3 committed Oct 18, 2023
1 parent da207c5 commit 53c27aa
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 11 deletions.
18 changes: 12 additions & 6 deletions test_unstructured_ingest/test-ingest-gcs-dest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ OUTPUT_FOLDER_NAME=gcs-dest
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
DESTINATION_GCS="gs://utic-ingest-test-fixtures-output/$(date +%s)/"
DESTINATION_GCS="gs://utic-test-ingest-fixtures-output/$(date +%s)"
CI=${CI:-"false"}

if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then
Expand All @@ -28,6 +28,12 @@ function cleanup() {
if [ "$CI" == "true" ]; then
cleanup_dir "$DOWNLOAD_DIR"
fi

if gcloud storage ls "$DESTINATION_GCS"; then
echo "deleting $DESTINATION_GCS"
gcloud storage rm --recursive "$DESTINATION_GCS"
fi

}

trap cleanup EXIT
Expand All @@ -48,8 +54,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \

# Simply check the number of files uploaded
expected_num_files=1
#num_files_in_s3=$(aws s3 ls "$DESTINATION_S3/example-docs/" --region us-east-2 | wc -l)
#if [ "$num_files_in_s3" -ne "$expected_num_files" ]; then
# echo "Expected $expected_num_files files to be uploaded to s3, but found $num_files_in_s3 files."
# exit 1
#fi
num_files_in_gcs=$(gcloud storage ls "$DESTINATION_GCS"/example-docs/ | wc -l )
if [ "$num_files_in_gcs" -ne "$expected_num_files" ]; then
echo "Expected $expected_num_files files to be uploaded to gcs, but found $num_files_in_gcs files."
exit 1
fi
12 changes: 8 additions & 4 deletions unstructured/ingest/connector/fsspec.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,9 +256,13 @@ def write_dict(

logger.info(f"Writing content using filesystem: {type(fs).__name__}")

dest_folder = self.connector_config.path_without_protocol
dest_output_path = str(PurePath(dest_folder, filename)) if filename else dest_folder
full_dest_path = f"{self.connector_config.protocol}://{dest_output_path}"
output_folder = self.connector_config.path_without_protocol
output_folder = os.path.join(output_folder) # Make sure folder ends with file seperator
filename = (
filename.strip(os.sep) if filename else filename
) # Make sure filename doesn't begin with file seperator
output_path = str(PurePath(output_folder, filename)) if filename else output_folder
full_dest_path = f"{self.connector_config.protocol}://{output_path}"
logger.debug(f"uploading content to {full_dest_path}")
fs.write_text(
full_dest_path,
Expand All @@ -269,7 +273,7 @@ def write_dict(

def write(self, docs: t.List[BaseIngestDoc]) -> None:
for doc in docs:
file_path = doc.base_filename
file_path = doc.base_output_filename
filename = file_path if file_path else None
with open(doc._output_filename) as json_file:
logger.debug(f"uploading content from {doc._output_filename}")
Expand Down
2 changes: 1 addition & 1 deletion unstructured/ingest/connector/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def _output_filename(self) -> Path:
"""
input_path = Path(self.connector_config.input_path)
basename = (
f"{Path(self.path).name}.json"
f"{self.base_filename}.json"
if input_path.is_file()
else f"{Path(self.path).relative_to(input_path)}.json"
)
Expand Down
9 changes: 9 additions & 0 deletions unstructured/ingest/interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,15 @@ def base_filename(self) -> t.Optional[str]:
return base_path
return None

@property
def base_output_filename(self) -> t.Optional[str]:
if self.processor_config.output_dir and self._output_filename:
output_path = str(Path(self.processor_config.output_dir).resolve())
full_path = str(self._output_filename)
base_path = full_path.replace(output_path, "")
return base_path
return None

@property
@abstractmethod
def _output_filename(self):
Expand Down

0 comments on commit 53c27aa

Please sign in to comment.