Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[#3758] Improvement(PyGVFS): Support OAuth2 authentication in Python GVFS #5030

Merged
merged 9 commits into from
Oct 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 43 additions & 3 deletions clients/client-python/gravitino/filesystem/gvfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@
from gravitino.audit.fileset_audit_constants import FilesetAuditConstants
from gravitino.audit.fileset_data_operation import FilesetDataOperation
from gravitino.audit.internal_client_type import InternalClientType
from gravitino.auth.default_oauth2_token_provider import DefaultOAuth2TokenProvider
from gravitino.auth.oauth2_token_provider import OAuth2TokenProvider
from gravitino.auth.simple_auth_provider import SimpleAuthProvider
from gravitino.catalog.fileset_catalog import FilesetCatalog
from gravitino.client.gravitino_client import GravitinoClient
Expand Down Expand Up @@ -92,16 +94,41 @@ def __init__(
"""
self._metalake = metalake_name
auth_type = (
GVFSConfig.DEFAULT_AUTH_TYPE
GVFSConfig.SIMPLE_AUTH_TYPE
if options is None
else options.get(GVFSConfig.AUTH_TYPE, GVFSConfig.DEFAULT_AUTH_TYPE)
else options.get(GVFSConfig.AUTH_TYPE, GVFSConfig.SIMPLE_AUTH_TYPE)
)
if auth_type == GVFSConfig.DEFAULT_AUTH_TYPE:
if auth_type == GVFSConfig.SIMPLE_AUTH_TYPE:
self._client = GravitinoClient(
uri=server_uri,
metalake_name=metalake_name,
auth_data_provider=SimpleAuthProvider(),
)
elif auth_type == GVFSConfig.OAUTH2_AUTH_TYPE:
xloya marked this conversation as resolved.
Show resolved Hide resolved
oauth2_server_uri = options.get(GVFSConfig.OAUTH2_SERVER_URI)
self._check_auth_config(
auth_type, GVFSConfig.OAUTH2_SERVER_URI, oauth2_server_uri
)

oauth2_credential = options.get(GVFSConfig.OAUTH2_CREDENTIAL)
self._check_auth_config(
auth_type, GVFSConfig.OAUTH2_CREDENTIAL, oauth2_credential
)

oauth2_path = options.get(GVFSConfig.OAUTH2_PATH)
self._check_auth_config(auth_type, GVFSConfig.OAUTH2_PATH, oauth2_path)

oauth2_scope = options.get(GVFSConfig.OAUTH2_SCOPE)
self._check_auth_config(auth_type, GVFSConfig.OAUTH2_SCOPE, oauth2_scope)

oauth2_token_provider: OAuth2TokenProvider = DefaultOAuth2TokenProvider(
oauth2_server_uri, oauth2_credential, oauth2_path, oauth2_scope
)
self._client = GravitinoClient(
uri=server_uri,
metalake_name=metalake_name,
auth_data_provider=oauth2_token_provider,
)
else:
raise GravitinoRuntimeException(
f"Authentication type {auth_type} is not supported."
Expand Down Expand Up @@ -686,6 +713,19 @@ def _strip_storage_protocol(storage_type: StorageType, path: str):
f"Storage type:{storage_type} doesn't support now."
)

@staticmethod
def _check_auth_config(auth_type: str, config_key: str, config_value: str):
    """Ensure a configuration entry required by the chosen auth type is present.

    Only ``None`` counts as missing; any other value is accepted as-is.

    :param auth_type: The auth type that makes this entry mandatory
    :param config_key: The name of the configuration entry being validated
    :param config_value: The value found for ``config_key``, or None if absent
    :raises GravitinoRuntimeException: If ``config_value`` is None
    """
    # Guard clause: a present value needs no further handling.
    if config_value is not None:
        return

    raise GravitinoRuntimeException(
        f"{config_key} should not be null"
        f" if {GVFSConfig.AUTH_TYPE} is set to {auth_type}."
    )

def _get_fileset_catalog(self, catalog_ident: NameIdentifier):
read_lock = self._catalog_cache_lock.gen_rlock()
try:
Expand Down
8 changes: 7 additions & 1 deletion clients/client-python/gravitino/filesystem/gvfs_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,10 @@ class GVFSConfig:
DEFAULT_CACHE_EXPIRED_TIME = 3600

AUTH_TYPE = "auth_type"
DEFAULT_AUTH_TYPE = "simple"
SIMPLE_AUTH_TYPE = "simple"

OAUTH2_AUTH_TYPE = "oauth2"
OAUTH2_SERVER_URI = "oauth2_server_uri"
OAUTH2_CREDENTIAL = "oauth2_credential"
OAUTH2_PATH = "oauth2_path"
OAUTH2_SCOPE = "oauth2_scope"
81 changes: 80 additions & 1 deletion clients/client-python/tests/unittests/test_gvfs_with_local.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,19 @@

from gravitino import gvfs, NameIdentifier
from gravitino.auth.auth_constants import AuthConstants
from gravitino.exceptions.base import GravitinoRuntimeException
from gravitino.exceptions.base import (
GravitinoRuntimeException,
IllegalArgumentException,
BadRequestException,
)
from gravitino.filesystem.gvfs_config import GVFSConfig
from tests.unittests import mock_base
from tests.unittests.auth.mock_base import (
mock_jwt,
GENERATED_TIME,
mock_authentication_with_error_authentication_type,
mock_authentication_invalid_grant_error,
)


def generate_unique_random_string(length):
Expand Down Expand Up @@ -106,6 +116,75 @@ def test_simple_auth(self, *mock_methods):
if current_user is not None:
os.environ["user.name"] = current_user

def test_oauth2_auth(self, *mock_methods):
"""Exercise GVFS construction with the `oauth2` auth type.

Three scenarios are covered in order: a normal authentication where token
retrieval is patched to return a mocked JWT, an "error authentication type"
response from the token endpoint, and a bad-request (invalid grant) response.
"""
# NOTE(review): *mock_methods presumably receives mocks injected by patch
# decorators that are not visible in this view — confirm against the class.
# All four OAuth2 options are supplied; the filesystem constructor checks each for None.
fs_options = {
GVFSConfig.AUTH_TYPE: GVFSConfig.OAUTH2_AUTH_TYPE,
GVFSConfig.OAUTH2_SERVER_URI: "http://127.0.0.1:1082",
GVFSConfig.OAUTH2_CREDENTIAL: "xx:xx",
GVFSConfig.OAUTH2_SCOPE: "test",
GVFSConfig.OAUTH2_PATH: "token/test",
}
# Scenario 1: normal authentication.
# Both token-retrieval hooks of DefaultOAuth2TokenProvider are patched to
# return the mocked JWT.
mocked_jwt = mock_jwt(
sub="gravitino", exp=GENERATED_TIME + 10000, aud="service1"
)
with patch(
"gravitino.auth.default_oauth2_token_provider.DefaultOAuth2TokenProvider._get_access_token",
return_value=mocked_jwt,
), patch(
"gravitino.auth.default_oauth2_token_provider.DefaultOAuth2TokenProvider._fetch_token",
return_value=mocked_jwt,
):
fileset_storage_location = f"{self._fileset_dir}/test_oauth2_auth"
fileset_virtual_location = "fileset/fileset_catalog/tmp/test_oauth2_auth"
actual_path = fileset_storage_location
# Resolve the virtual fileset location to the local storage directory.
with patch(
"gravitino.catalog.fileset_catalog.FilesetCatalog.get_file_location",
return_value=actual_path,
):
# Create a directory and a file on the local filesystem under the storage location.
local_fs = LocalFileSystem()
local_fs.mkdir(fileset_storage_location)
sub_dir_path = f"{fileset_storage_location}/test_1"
local_fs.mkdir(sub_dir_path)
self.assertTrue(local_fs.exists(sub_dir_path))
sub_file_path = f"{fileset_storage_location}/test_file_1.par"
local_fs.touch(sub_file_path)
self.assertTrue(local_fs.exists(sub_file_path))
fs = gvfs.GravitinoVirtualFileSystem(
server_uri="http://localhost:9090",
metalake_name="metalake_demo",
options=fs_options,
skip_instance_cache=True,
)
# should not raise exception
self.assertTrue(fs.exists(fileset_virtual_location))

# Scenario 2: error authentication type.
# HTTPClient.post_form is patched to return the "error authentication type"
# mock; constructing the filesystem is expected to raise IllegalArgumentException.
with patch(
"gravitino.utils.http_client.HTTPClient.post_form",
return_value=mock_authentication_with_error_authentication_type(),
):
with self.assertRaises(IllegalArgumentException):
gvfs.GravitinoVirtualFileSystem(
server_uri="http://localhost:9090",
metalake_name="metalake_demo",
options=fs_options,
skip_instance_cache=True,
)

# Scenario 3: bad request.
# HTTPClient._make_request is patched to return the invalid-grant error mock;
# constructing the filesystem is expected to raise BadRequestException.
with patch(
"gravitino.utils.http_client.HTTPClient._make_request",
return_value=mock_authentication_invalid_grant_error(),
):
with self.assertRaises(BadRequestException):
gvfs.GravitinoVirtualFileSystem(
server_uri="http://localhost:9090",
metalake_name="metalake_demo",
options=fs_options,
skip_instance_cache=True,
)

def test_ls(self, *mock_methods):
fileset_storage_location = f"{self._fileset_dir}/test_ls"
fileset_virtual_location = "fileset/fileset_catalog/tmp/test_ls"
Expand Down
43 changes: 34 additions & 9 deletions docs/how-to-use-gvfs.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ license: "This software is licensed under the Apache License version 2."
directories, with `fileset` you can manage non-tabular data through Gravitino. For
details, you can read [How to manage fileset metadata using Gravitino](./manage-fileset-metadata-using-gravitino.md).

To use `Fileset` managed by Gravitino, Gravitino provides a virtual file system layer called
To use `fileset` managed by Gravitino, Gravitino provides a virtual file system layer called
the Gravitino Virtual File System (GVFS):
* In Java, it's built on top of the Hadoop Compatible File System(HCFS) interface.
* In Python, it's built on top of the [fsspec](https://filesystem-spec.readthedocs.io/en/stable/index.html)
Expand Down Expand Up @@ -335,13 +335,17 @@ to recompile the native libraries like `libhdfs` and others, and completely repl

### Configuration

| Configuration item | Description | Default value | Required | Since version |
|----------------------|----------------------------------------------------------------------------------------------------------------------------------------------|---------------|----------|---------------|
| `server_uri` | The Gravitino server uri, e.g. `http://localhost:8090`. | (none) | Yes | 0.6.0 |. | (none) | Yes | 0.6.0 |
| `metalake_name` | The metalake name which the fileset belongs to. | (none) | Yes | 0.6.0 |. | (none) | Yes | 0.6.0 | . | (none) | Yes | 0.6.0 |
| `cache_size` | The cache capacity of the Gravitino Virtual File System. | `20` | No | 0.6.0 |. | (none) | Yes | 0.6.0 | . | (none) | Yes | 0.6.0 |
| `cache_expired_time` | The value of time that the cache expires after accessing in the Gravitino Virtual File System. The value is in `seconds`. | `3600` | No | 0.6.0 |.
| `auth_type` | The auth type to initialize the Gravitino client to use with the Gravitino Virtual File System. Currently only supports `simple` auth types. | `simple` | No | 0.6.0 |.
| Configuration item | Description | Default value | Required | Since version |
|----------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------|---------------|-----------------------------------|---------------|
| `server_uri` | The Gravitino server URI, e.g. `http://localhost:8090`. | (none) | Yes | 0.6.0 |
| `metalake_name` | The metalake name which the fileset belongs to. | (none) | Yes | 0.6.0 |
| `cache_size` | The cache capacity of the Gravitino Virtual File System. | `20` | No | 0.6.0 |
| `cache_expired_time` | The value of time that the cache expires after accessing in the Gravitino Virtual File System. The value is in `seconds`. | `3600` | No | 0.6.0 |
| `auth_type` | The auth type to initialize the Gravitino client to use with the Gravitino Virtual File System. Currently supports `simple` and `oauth2` auth types. | `simple` | No | 0.6.0 |
| `oauth2_server_uri` | The auth server URI for the Gravitino client when using `oauth2` auth type. | (none) | Yes if you use `oauth2` auth type | 0.7.0 |
| `oauth2_credential` | The auth credential for the Gravitino client when using `oauth2` auth type. | (none) | Yes if you use `oauth2` auth type | 0.7.0 |
| `oauth2_path` | The auth server path for the Gravitino client when using `oauth2` auth type. Please remove the first slash `/` from the path, for example `oauth/token`. | (none) | Yes if you use `oauth2` auth type | 0.7.0 |
| `oauth2_scope` | The auth scope for the Gravitino client when using `oauth2` auth type with the Gravitino Virtual File System. | (none) | Yes if you use `oauth2` auth type | 0.7.0 |


You can configure these properties when obtaining the `Gravitino Virtual FileSystem` in Python like this:
Expand Down Expand Up @@ -538,7 +542,7 @@ print(documents)

### Authentication

Currently, Gravitino Virtual File System in Python only supports one kind of authentication types to access Gravitino server: `simple`.
Currently, the Gravitino Virtual File System in Python supports two authentication types for accessing the Gravitino server: `simple` and `oauth2`.

The type of `simple` is the default authentication type in Gravitino Virtual File System in Python.

Expand All @@ -556,4 +560,25 @@ from gravitino import gvfs
options = {"auth_type": "simple"}
fs = gvfs.GravitinoVirtualFileSystem(server_uri="http://localhost:8090", metalake_name="test_metalake", options=options)
print(fs.ls("gvfs://fileset/fileset_catlaog/tmp/test_fileset"))
```

##### Using `oauth2` authentication

First, make sure that your Gravitino server is also configured to use the `oauth2` authentication mode,
and that you have an OAuth server to fetch the token from. For details, see [Security](security/security.md).

Then, you can configure the authentication like this:

```python
from gravitino import gvfs
from gravitino.filesystem.gvfs_config import GVFSConfig

options = {
GVFSConfig.AUTH_TYPE: GVFSConfig.OAUTH2_AUTH_TYPE,
GVFSConfig.OAUTH2_SERVER_URI: "http://127.0.0.1:1082",
GVFSConfig.OAUTH2_CREDENTIAL: "xx:xx",
GVFSConfig.OAUTH2_SCOPE: "test",
GVFSConfig.OAUTH2_PATH: "token/test",
}
fs = gvfs.GravitinoVirtualFileSystem(server_uri="http://localhost:8090", metalake_name="test_metalake", options=options)
print(fs.ls("gvfs://fileset/fileset_catalog/tmp/test_fileset"))
```
Loading