diff --git a/docs/data_explorer.md b/docs/data_explorer.md
index 7388d2c5c..22f21060b 100644
--- a/docs/data_explorer.md
+++ b/docs/data_explorer.md
@@ -30,7 +30,7 @@ You can setup the data explorer container with the `fondant explore` CLI command
 
 Where the base path can be either a local or remote base path. Make sure to pass the proper mount
 credentials arguments when using a remote base path or a local base path that references remote datasets. You can do that either with `--auth-gcp`, `--auth-aws` or `--auth-azure` to
-mount your default local cloud credentials to the pipeline. Or You can also use the `--credentials` argument to mount custom credentials to the local container pipeline.
+mount your default local cloud credentials to the pipeline. Alternatively, you can use the `--extra-volumes` flag to specify credentials or local files you need to mount.
 
 Example:
diff --git a/src/fondant/cli.py b/src/fondant/cli.py
index 0fb870997..311b372cb 100644
--- a/src/fondant/cli.py
+++ b/src/fondant/cli.py
@@ -42,8 +42,6 @@ def get_cloud_credentials(args) -> t.Optional[str]:
         return CloudCredentialsMount.AWS.value
     if args.auth_azure:
         return CloudCredentialsMount.AZURE.value
-    if args.credentials:
-        return args.credentials
     return None
@@ -103,7 +101,7 @@ def register_explore(parent_parser):
     This will spin up a docker container that hosts a web application that allows you to explore the
     data produced by a fondant pipeline. The default address is http://localhost:8501. You can
     choose both a local and remote base path to explore. If the data that you want to explore is stored remotely, you
-    should use the --credentials flag to specify the path to a file that contains the credentials to access remote data (for S3, GCS, etc).
+    should use the --extra-volumes flag to specify credentials or local files you need to mount.
 
     Example:
@@ -165,17 +163,11 @@ def register_explore(parent_parser):
     )
 
     auth_group.add_argument(
-        "--credentials",
-        "-c",
-        type=str,
-        default=None,
-        help="""Path mapping of the source (local) and target (docker file system)
-        credential paths in the format of src:target
-        More info on
-        Google Cloud: https://cloud.google.com/docs/authentication/application-default-credentials
-        AWS: https://docs.aws.amazon.com/sdkref/latest/guide/file-location.html
-        Azure: https://stackoverflow.com/questions/69010943/how-does-az-login-store-credential-information
-        """,
+        "--extra-volumes",
+        help="""Extra volumes to mount in the containers, in Docker's src:dst volume format. This can be used:
+        - to mount data directories to be used by the pipeline (note that if your pipeline's base_path is local, it will already be mounted for you).
+        - to mount cloud credentials.""",
+        nargs="+",
     )
 
     parser.set_defaults(func=explore)
@@ -189,8 +181,16 @@ def explore(args):
             "Docker runtime not found. Please install Docker and try again.",
         )
 
+    extra_volumes = []
+    cloud_cred = get_cloud_credentials(args)
+    if cloud_cred:
+        extra_volumes.append(cloud_cred)
+
+    if args.extra_volumes:
+        extra_volumes.extend(args.extra_volumes)
+
     run_explorer_app(
         base_path=args.base_path,
         container=args.container,
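Taken together, the `cli.py` hunks mean `explore(args)` now folds the default cloud-credential mount and any user-supplied `--extra-volumes` values into a single list. A minimal sketch of that flow, assuming an `argparse.Namespace` shaped like the one the parser above produces (the volume mappings are illustrative):

```python
import argparse

from fondant.cli import get_cloud_credentials

# Illustrative namespace: --auth-gcp plus two --extra-volumes mappings.
args = argparse.Namespace(
    auth_gcp=True,
    auth_aws=False,
    auth_azure=False,
    extra_volumes=["/data:/data", "$HOME/creds.json:/root/creds.json"],
)

extra_volumes = []
cloud_cred = get_cloud_credentials(args)  # default GCP mount mapping, if any
if cloud_cred:
    extra_volumes.append(cloud_cred)
if args.extra_volumes:
    extra_volumes.extend(args.extra_volumes)

# extra_volumes now holds the credential mount followed by the user mounts.
```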
diff --git a/src/fondant/explore.py b/src/fondant/explore.py
index 31aae48b4..3c9a7fcc5 100644
--- a/src/fondant/explore.py
+++ b/src/fondant/explore.py
@@ -15,9 +15,28 @@ def run_explorer_app(  # type: ignore
     port: int = 8501,
     container: str = "fndnt/data_explorer",
     tag: str = "latest",
-    credentials: t.Optional[str] = None,
+    extra_volumes: t.Optional[t.Union[list, str]] = None,
 ):  # type: ignore
-    """Run the data explorer container."""
+    """
+    Run an Explorer App in a Docker container.
+
+    Args:
+        base_path: The base path where the Explorer App will be mounted.
+        port: The port number to expose the Explorer App. Default is 8501.
+        container: The Docker container name or image to use. Default is "fndnt/data_explorer".
+        tag: The tag/version of the Docker container. Default is "latest".
+        extra_volumes: Extra volumes to mount in the container, in Docker's src:dst volume
+            format. This can be used:
+            - to mount data directories to be used by the pipeline (note that if your
+              pipeline's base_path is local, it will already be mounted for you).
+            - to mount cloud credentials.
+    """
+    if extra_volumes is None:
+        extra_volumes = []
+
+    if isinstance(extra_volumes, str):
+        extra_volumes = [extra_volumes]
+
     cmd = [
         "docker",
         "run",
@@ -30,34 +49,23 @@ def run_explorer_app(  # type: ignore
         f"{port}:8501",
     ]
 
-    # mount the credentials file to the container
-    if credentials:
-        # check if path is read only
-        if not credentials.endswith(":ro"):
-            credentials += ":ro"
-
-        cmd.extend(
-            [
-                "-v",
-                credentials,
-            ],
-        )
-
     fs, _ = fsspec.core.url_to_fs(base_path)
     if isinstance(fs, LocalFileSystem):
         logging.info(f"Using local base path: {base_path}")
         logging.info(
             "This directory will be mounted to /artifacts in the container.",
         )
-        if not credentials:
-            logging.warning(
-                "You have not provided a credentials file. If you wish to access data "
-                "from a cloud provider, mount the credentials file with the --credentials flag.",
-            )
         data_directory_path = Path(base_path).resolve()
         host_machine_path = data_directory_path.as_posix()
         container_path = os.path.join("/", data_directory_path.name)
 
+        # Mount extra volumes to the container
+        if extra_volumes:
+            for volume in extra_volumes:
+                cmd.extend(
+                    ["-v", volume],
+                )
+
         # Mount the local base path to the container
         cmd.extend(
             ["-v", f"{shlex.quote(host_machine_path)}:{shlex.quote(container_path)}"],
@@ -75,7 +83,7 @@ def run_explorer_app(  # type: ignore
         )
 
     else:
-        if credentials is None:
+        if not extra_volumes:
             raise RuntimeError(
                 None,
                 f"Specified base path `{base_path}`, Please provide valid credentials when using"
@@ -84,6 +92,13 @@ def run_explorer_app(  # type: ignore
 
         logging.info(f"Using remote base path: {base_path}")
 
+        # Mount extra volumes to the container
+        if extra_volumes:
+            for volume in extra_volumes:
+                cmd.extend(
+                    ["-v", volume],
+                )
+
         # add the image name
         cmd.extend(
             [
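Because `run_explorer_app` is also importable directly, the new parameter can be exercised without the CLI. A hedged usage sketch (the bucket name and credential paths are placeholders, not part of this change):

```python
from fondant.explore import run_explorer_app

# Explore a remote base path, mounting local GCP credentials into the
# container; each entry uses Docker's src:dst volume format.
run_explorer_app(
    base_path="gs://my-bucket/artifacts",
    extra_volumes=[
        "$HOME/.config/gcloud/application_default_credentials.json:"
        "/root/.config/gcloud/application_default_credentials.json",
    ],
)

# A single mapping string is accepted too; it is normalized to a
# one-element list before the Docker command is built:
# run_explorer_app(base_path="gs://my-bucket/artifacts", extra_volumes="src:dst")
```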
diff --git a/tests/test_explorer.py b/tests/test_explorer.py
index 34e3432a2..0ba10dc42 100644
--- a/tests/test_explorer.py
+++ b/tests/test_explorer.py
@@ -20,7 +20,7 @@ def remote_path() -> str:
 
 
 @pytest.fixture()
-def credentials() -> str:
+def extra_volumes() -> str:
     return (
         "$HOME/.config/gcloud/application_default_credentials.json:/root/.config/"
         "gcloud/application_default_credentials.json"
@@ -32,7 +32,7 @@ def container_path() -> str:
     return "/source"
 
 
-def test_run_data_explorer_local_base_path(host_path, container_path, credentials):
+def test_run_data_explorer_local_base_path(host_path, container_path, extra_volumes):
     """Test that the data explorer can be run with a local base path."""
     with patch("subprocess.call") as mock_call:
         run_explorer_app(
@@ -40,7 +40,7 @@ def test_run_data_explorer_local_base_path(host_path, container_path, credential
             port=DEFAULT_PORT,
             container=DEFAULT_CONTAINER,
             tag=DEFAULT_TAG,
-            credentials=credentials,
+            extra_volumes=extra_volumes,
         )
         mock_call.assert_called_once_with(
             [
@@ -54,7 +54,7 @@ def test_run_data_explorer_local_base_path(host_path, container_path, credential
                 "-p",
                 "8501:8501",
                 "-v",
-                f"{credentials}:ro",
+                f"{extra_volumes}",
                 "-v",
                 f"{Path(host_path).resolve()}:{container_path}",
                 f"{DEFAULT_CONTAINER}:{DEFAULT_TAG}",
@@ -65,7 +65,7 @@ def test_run_data_explorer_local_base_path(host_path, container_path, credential
     )
 
 
-def test_run_data_explorer_remote_base_path(remote_path, credentials):
+def test_run_data_explorer_remote_base_path(remote_path, extra_volumes):
     """Test that the data explorer can be run with a remote base path."""
     with patch("subprocess.call") as mock_call:
         run_explorer_app(
@@ -73,7 +73,7 @@ def test_run_data_explorer_remote_base_path(remote_path, credentials):
             port=DEFAULT_PORT,
             container=DEFAULT_CONTAINER,
             tag=DEFAULT_TAG,
-            credentials=credentials,
+            extra_volumes=extra_volumes,
         )
 
         mock_call.assert_called_once_with(
@@ -88,7 +88,7 @@ def test_run_data_explorer_remote_base_path(remote_path, credentials):
             "-p",
             "8501:8501",
             "-v",
-            f"{credentials}:ro",
+            f"{extra_volumes}",
             f"{DEFAULT_CONTAINER}:{DEFAULT_TAG}",
             "--base_path",
             f"{remote_path}",
@@ -111,5 +111,5 @@ def test_invalid_run_data_explorer_remote_base_path(remote_path):
             port=DEFAULT_PORT,
             container=DEFAULT_CONTAINER,
             tag=DEFAULT_TAG,
-            credentials=None,
+            extra_volumes=None,
         )
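The updated fixtures still only exercise the single-string form of `extra_volumes`; the list form takes the same path after normalization. A sketch of an extra case that could cover it (not part of this diff; it reuses the existing `host_path` fixture and the same mocking style):

```python
from unittest.mock import patch

from fondant.explore import run_explorer_app


def test_run_data_explorer_extra_volumes_list(host_path):
    """Sketch: extra_volumes may also be passed as a list of mappings."""
    with patch("subprocess.call") as mock_call:
        run_explorer_app(base_path=host_path, extra_volumes=["/a:/a", "/b:/b"])
        cmd = mock_call.call_args[0][0]
        # One "-v" per extra volume, plus one for the local base path mount.
        assert cmd.count("-v") == 3
```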