diff --git a/test_unstructured_ingest/test-ingest-gcs-dest.sh b/test_unstructured_ingest/test-ingest-gcs-dest.sh index 247d813e30..2437b68051 100755 --- a/test_unstructured_ingest/test-ingest-gcs-dest.sh +++ b/test_unstructured_ingest/test-ingest-gcs-dest.sh @@ -8,7 +8,7 @@ OUTPUT_FOLDER_NAME=gcs-dest OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} -DESTINATION_GCS="gs://utic-ingest-test-fixtures-output/$(date +%s)/" +DESTINATION_GCS="gs://utic-test-ingest-fixtures-output/$(date +%s)" CI=${CI:-"false"} if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then @@ -28,6 +28,12 @@ function cleanup() { if [ "$CI" == "true" ]; then cleanup_dir "$DOWNLOAD_DIR" fi + + if gcloud storage ls "$DESTINATION_GCS"; then + echo "deleting $DESTINATION_GCS" + gcloud storage rm --recursive "$DESTINATION_GCS" + fi + } trap cleanup EXIT @@ -48,8 +54,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ # Simply check the number of files uploaded expected_num_files=1 -#num_files_in_s3=$(aws s3 ls "$DESTINATION_S3/example-docs/" --region us-east-2 | wc -l) -#if [ "$num_files_in_s3" -ne "$expected_num_files" ]; then -# echo "Expected $expected_num_files files to be uploaded to s3, but found $num_files_in_s3 files." -# exit 1 -#fi +num_files_in_gcs=$(gcloud storage ls "$DESTINATION_GCS"/example-docs/ | wc -l ) +if [ "$num_files_in_gcs" -ne "$expected_num_files" ]; then + echo "Expected $expected_num_files files to be uploaded to gcs, but found $num_files_in_gcs files." + exit 1 +fi diff --git a/unstructured/ingest/connector/fsspec.py b/unstructured/ingest/connector/fsspec.py index 2ffbebca55..1b3cf5bfbc 100644 --- a/unstructured/ingest/connector/fsspec.py +++ b/unstructured/ingest/connector/fsspec.py @@ -256,9 +256,13 @@ def write_dict( logger.info(f"Writing content using filesystem: {type(fs).__name__}") - dest_folder = self.connector_config.path_without_protocol - dest_output_path = str(PurePath(dest_folder, filename)) if filename else dest_folder - full_dest_path = f"{self.connector_config.protocol}://{dest_output_path}" + output_folder = self.connector_config.path_without_protocol + output_folder = os.path.join(output_folder) # Make sure folder ends with file seperator + filename = ( + filename.strip(os.sep) if filename else filename + ) # Make sure filename doesn't begin with file seperator + output_path = str(PurePath(output_folder, filename)) if filename else output_folder + full_dest_path = f"{self.connector_config.protocol}://{output_path}" logger.debug(f"uploading content to {full_dest_path}") fs.write_text( full_dest_path, @@ -269,7 +273,7 @@ def write_dict( def write(self, docs: t.List[BaseIngestDoc]) -> None: for doc in docs: - file_path = doc.base_filename + file_path = doc.base_output_filename filename = file_path if file_path else None with open(doc._output_filename) as json_file: logger.debug(f"uploading content from {doc._output_filename}") diff --git a/unstructured/ingest/connector/local.py b/unstructured/ingest/connector/local.py index 88688bb723..2d4ee13bb0 100644 --- a/unstructured/ingest/connector/local.py +++ b/unstructured/ingest/connector/local.py @@ -65,7 +65,7 @@ def _output_filename(self) -> Path: """ input_path = Path(self.connector_config.input_path) basename = ( - f"{Path(self.path).name}.json" + f"{self.base_filename}.json" if input_path.is_file() else f"{Path(self.path).relative_to(input_path)}.json" ) diff --git a/unstructured/ingest/interfaces.py b/unstructured/ingest/interfaces.py index 1986a0c567..f51ed69ca1 100644 --- a/unstructured/ingest/interfaces.py +++ b/unstructured/ingest/interfaces.py @@ -290,6 +290,15 @@ def base_filename(self) -> t.Optional[str]: return base_path return None + @property + def base_output_filename(self) -> t.Optional[str]: + if self.processor_config.output_dir and self._output_filename: + output_path = str(Path(self.processor_config.output_dir).resolve()) + full_path = str(self._output_filename) + base_path = full_path.replace(output_path, "") + return base_path + return None + @property @abstractmethod def _output_filename(self):