From 4742729cc45951e8e395e508d9ca4337120c8985 Mon Sep 17 00:00:00 2001 From: Ben Sully Date: Tue, 19 May 2020 17:05:13 +0100 Subject: [PATCH 01/21] Add dagster-azure package with various storage components This adds the following components based on Azure Data Lake Storage Gen2 (and Azure Blob Storage where appropriate): - ADLS2FileCache and adls2_file_cache - ADLS2FileManager - ADLS2IntermediateStore - ADLS2ObjectStore - the adls2_resource providing direct access to Azure Data Lake Storage - the adls2_system_storage system storage This is pretty similar to the S3 implementation, the main difference being configuration: Azure's SDK requires credentials to be passed explicitly, so the credential is expected in configuration. Tests currently require an access key to complete any tests marked 'nettest'. --- .../libraries/dagster-azure/.coveragerc | 2 + .../libraries/dagster-azure/LICENSE | 201 ++++++++ .../libraries/dagster-azure/MANIFEST.in | 5 + .../dagster-azure/dagster_azure/__init__.py | 5 + .../dagster_azure/adls2/__init__.py | 11 + .../adls2/adls2_fake_resource.py | 143 ++++++ .../dagster_azure/adls2/file_cache.py | 74 +++ .../dagster_azure/adls2/file_manager.py | 109 +++++ .../dagster_azure/adls2/intermediate_store.py | 38 ++ .../dagster_azure/adls2/object_store.py | 155 ++++++ .../dagster_azure/adls2/resources.py | 76 +++ .../dagster_azure/adls2/system_storage.py | 75 +++ .../dagster_azure/adls2/utils.py | 13 + .../dagster_azure/blob/__init__.py | 2 + .../dagster_azure/blob/blob_fake_resource.py | 164 +++++++ .../dagster_azure/blob/compute_log_manager.py | 200 ++++++++ .../dagster_azure/blob/intermediate_store.py | 41 ++ .../dagster_azure/blob/object_store.py | 138 ++++++ .../dagster-azure/dagster_azure/blob/utils.py | 15 + .../dagster-azure/dagster_azure/version.py | 3 + .../dagster_azure_tests/__init__.py | 0 .../adls2_tests/__init__.py | 0 .../adls2_tests/conftest.py | 16 + .../adls2_tests/test_adls2_file_cache.py | 60 +++ .../adls2_tests/test_adls2_file_manager.py | 199 ++++++++ .../adls2_tests/test_intermediate_store.py | 457 ++++++++++++++++++ .../adls2_tests/test_object_store.py | 37 ++ .../blob_tests/conftest.py | 16 + .../blob_tests/test_compute_log_manager.py | 127 +++++ .../dagster_azure_tests/test_version.py | 5 + .../dagster-azure/dev-requirements.txt | 0 .../libraries/dagster-azure/setup.cfg | 2 + .../libraries/dagster-azure/setup.py | 53 ++ .../libraries/dagster-azure/tox.ini | 24 + 34 files changed, 2466 insertions(+) create mode 100644 python_modules/libraries/dagster-azure/.coveragerc create mode 100644 python_modules/libraries/dagster-azure/LICENSE create mode 100644 python_modules/libraries/dagster-azure/MANIFEST.in create mode 100644 python_modules/libraries/dagster-azure/dagster_azure/__init__.py create mode 100644 python_modules/libraries/dagster-azure/dagster_azure/adls2/__init__.py create mode 100644 python_modules/libraries/dagster-azure/dagster_azure/adls2/adls2_fake_resource.py create mode 100644 python_modules/libraries/dagster-azure/dagster_azure/adls2/file_cache.py create mode 100644 python_modules/libraries/dagster-azure/dagster_azure/adls2/file_manager.py create mode 100644 python_modules/libraries/dagster-azure/dagster_azure/adls2/intermediate_store.py create mode 100644 python_modules/libraries/dagster-azure/dagster_azure/adls2/object_store.py create mode 100644 python_modules/libraries/dagster-azure/dagster_azure/adls2/resources.py create mode 100644 python_modules/libraries/dagster-azure/dagster_azure/adls2/system_storage.py create mode 100644 
python_modules/libraries/dagster-azure/dagster_azure/adls2/utils.py create mode 100644 python_modules/libraries/dagster-azure/dagster_azure/blob/__init__.py create mode 100644 python_modules/libraries/dagster-azure/dagster_azure/blob/blob_fake_resource.py create mode 100644 python_modules/libraries/dagster-azure/dagster_azure/blob/compute_log_manager.py create mode 100644 python_modules/libraries/dagster-azure/dagster_azure/blob/intermediate_store.py create mode 100644 python_modules/libraries/dagster-azure/dagster_azure/blob/object_store.py create mode 100644 python_modules/libraries/dagster-azure/dagster_azure/blob/utils.py create mode 100644 python_modules/libraries/dagster-azure/dagster_azure/version.py create mode 100644 python_modules/libraries/dagster-azure/dagster_azure_tests/__init__.py create mode 100644 python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/__init__.py create mode 100644 python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/conftest.py create mode 100644 python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_adls2_file_cache.py create mode 100644 python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_adls2_file_manager.py create mode 100644 python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_intermediate_store.py create mode 100644 python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_object_store.py create mode 100644 python_modules/libraries/dagster-azure/dagster_azure_tests/blob_tests/conftest.py create mode 100644 python_modules/libraries/dagster-azure/dagster_azure_tests/blob_tests/test_compute_log_manager.py create mode 100644 python_modules/libraries/dagster-azure/dagster_azure_tests/test_version.py create mode 100644 python_modules/libraries/dagster-azure/dev-requirements.txt create mode 100644 python_modules/libraries/dagster-azure/setup.cfg create mode 100644 python_modules/libraries/dagster-azure/setup.py create mode 100644 python_modules/libraries/dagster-azure/tox.ini diff --git a/python_modules/libraries/dagster-azure/.coveragerc b/python_modules/libraries/dagster-azure/.coveragerc new file mode 100644 index 0000000000000..398ff08afa472 --- /dev/null +++ b/python_modules/libraries/dagster-azure/.coveragerc @@ -0,0 +1,2 @@ +[run] +branch = True diff --git a/python_modules/libraries/dagster-azure/LICENSE b/python_modules/libraries/dagster-azure/LICENSE new file mode 100644 index 0000000000000..8dada3edaf50d --- /dev/null +++ b/python_modules/libraries/dagster-azure/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright {yyyy} {name of copyright owner} + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/python_modules/libraries/dagster-azure/MANIFEST.in b/python_modules/libraries/dagster-azure/MANIFEST.in new file mode 100644 index 0000000000000..480c841039671 --- /dev/null +++ b/python_modules/libraries/dagster-azure/MANIFEST.in @@ -0,0 +1,5 @@ +recursive-include dagster_azure *.sh +recursive-include dagster_azure *.yaml +recursive-include dagster_azure *.txt +recursive-include dagster_azure *.template +include LICENSE diff --git a/python_modules/libraries/dagster-azure/dagster_azure/__init__.py b/python_modules/libraries/dagster-azure/dagster_azure/__init__.py new file mode 100644 index 0000000000000..41b391e118771 --- /dev/null +++ b/python_modules/libraries/dagster-azure/dagster_azure/__init__.py @@ -0,0 +1,5 @@ +from dagster.core.utils import check_dagster_package_version + +from .version import __version__ + +check_dagster_package_version('dagster-azure', __version__) diff --git a/python_modules/libraries/dagster-azure/dagster_azure/adls2/__init__.py b/python_modules/libraries/dagster-azure/dagster_azure/adls2/__init__.py new file mode 100644 index 0000000000000..c2246756a6250 --- /dev/null +++ b/python_modules/libraries/dagster-azure/dagster_azure/adls2/__init__.py @@ -0,0 +1,11 @@ +from .adls2_fake_resource import ADLS2FakeClient, create_adls2_fake_resource +from .file_cache import ADLS2FileCache, adls2_file_cache +from .file_manager import ADLS2FileHandle, ADLS2FileManager +from .intermediate_store import ADLS2IntermediateStore +from .object_store import ADLS2ObjectStore +from .resources import adls2_resource +from .system_storage import adls2_plus_default_storage_defs, adls2_system_storage +from .utils import create_adls2_client + +# from .compute_log_manager import ADLS2ComputeLogManager +# from .solids import ADLS2Coordinate, file_handle_to_adls2 diff --git a/python_modules/libraries/dagster-azure/dagster_azure/adls2/adls2_fake_resource.py b/python_modules/libraries/dagster-azure/dagster_azure/adls2/adls2_fake_resource.py new file mode 100644 index 0000000000000..25d187d37eb4d --- /dev/null +++ b/python_modules/libraries/dagster-azure/dagster_azure/adls2/adls2_fake_resource.py @@ -0,0 +1,143 @@ +from collections import defaultdict +from contextlib import contextmanager +import io +import random + +from azure.core.exceptions import ResourceNotFoundError + +from dagster.seven import mock + + +def create_adls2_fake_resource(account_name): + '''Create a mock ADLS2 client for test.''' + return ADLS2FakeClient(account_name, 'fake-creds') + + +class ADLS2FakeClient(object): + '''Stateful mock of an ADLS2 client for testing. + + Wraps a ``mock.MagicMock``. Containers are implemented using an in-memory dict. 
+ ''' + + def __init__(self, account_name, credential): + + self._account_name = account_name + self._credential = mock.MagicMock() + self._credential.account_key = credential + self._file_systems = {} + + @property + def account_name(self): + return self._account_name + + @property + def credential(self): + return self._credential + + @property + def file_systems(self): + return self._file_systems + + def get_file_system_client(self, file_system): + return self._file_systems.setdefault( + file_system, ADLS2FakeFilesystemClient(self.account_name, file_system) + ) + + def get_file_client(self, file_system, file_path): + return self.get_file_system_client(file_system).get_file_client(file_path) + + +class ADLS2FakeFilesystemClient(object): + '''Stateful mock of an ADLS2 filesystem client for testing.''' + + def __init__(self, account_name, file_system_name): + self._file_system = defaultdict(ADLS2FakeFileClient) + self._account_name = account_name + self._file_system_name = file_system_name + + @property + def account_name(self): + return self._account_name + + @property + def file_system_name(self): + return self._file_system_name + + def keys(self): + return self._file_system.keys() + + def get_file_system_properties(self): + return {"account_name": self.account_name, "file_system_name": self.file_system_name} + + def has_file(self, path): + return bool(self._file_system.get(path)) + + def get_file_client(self, file_path): + return self._file_system[file_path] + + def create_file(self, file): + return self._file_system[file] + + def delete_file(self, file): + for k in list(self._file_system.keys()): + if k.startswith(file): + del self._file_system[k] + + +class ADLS2FakeFileClient(object): + '''Stateful mock of an ADLS2 file client for testing.''' + + def __init__(self): + self.contents = None + self.lease = None + + def get_file_properties(self): + if self.contents is None: + raise ResourceNotFoundError("File does not exist!") + return {"lease": self.lease} + + def upload_data(self, contents, overwrite=False, lease=None): + if self.lease is not None: + if lease != self.lease: + raise Exception("Invalid lease!") + if self.contents is not None or overwrite is True: + if isinstance(contents, str): + self.contents = contents.encode('utf8') + elif isinstance(contents, io.BytesIO): + self.contents = contents.read() + elif isinstance(contents, io.StringIO): + self.contents = contents.read().encode('utf8') + elif isinstance(contents, bytes): + self.contents = contents + else: + print("Uploading unknown data") + self.contents = contents + + @contextmanager + def acquire_lease(self, lease_duration=-1): # pylint: disable=unused-argument + if self.lease is None: + self.lease = random.randint(0, 2 ** 9) + try: + yield self.lease + finally: + self.lease = None + else: + raise Exception("Lease already held") + + def download_file(self): + if self.contents is None: + raise ResourceNotFoundError("File does not exist!") + return ADLS2FakeFileDownloader(contents=self.contents) + + +class ADLS2FakeFileDownloader(object): + '''Mock of an ADLS2 file downloader for testing.''' + + def __init__(self, contents): + self.contents = contents + + def readall(self): + return self.contents + + def readinto(self, fileobj): + fileobj.write(self.contents) diff --git a/python_modules/libraries/dagster-azure/dagster_azure/adls2/file_cache.py b/python_modules/libraries/dagster-azure/dagster_azure/adls2/file_cache.py new file mode 100644 index 0000000000000..6468039dd86cc --- /dev/null +++ 
b/python_modules/libraries/dagster-azure/dagster_azure/adls2/file_cache.py @@ -0,0 +1,74 @@ +from azure.core.exceptions import ResourceNotFoundError + +from dagster import Field, String, StringSource, Selector, check, resource +from dagster.core.storage.file_cache import FileCache + +from .file_manager import ADLS2FileHandle +from .utils import create_adls2_client + + +class ADLS2FileCache(FileCache): + def __init__( + self, storage_account, file_system, prefix, credential=None, overwrite=False, client=None + ): + super(ADLS2FileCache, self).__init__(overwrite=overwrite) + + self.storage_account = storage_account + self.file_system = file_system + self.prefix = prefix + + self.client = client or create_adls2_client(storage_account, credential) + + def has_file_object(self, file_key): + check.str_param(file_key, 'file_key') + try: + file = self.client.get_file_client(self.file_system, self.get_full_key(file_key)) + file.get_file_properties() + except ResourceNotFoundError: + return False + return True + + def get_full_key(self, file_key): + return '{base_key}/{file_key}'.format(base_key=self.prefix, file_key=file_key) + + def write_file_object(self, file_key, source_file_object): + check.str_param(file_key, 'file_key') + + adls2_key = self.get_full_key(file_key) + adls2_file = self.client.get_file_client(file_system=self.file_system, file_path=adls2_key) + adls2_file.upload_data(source_file_object, overwrite=True) + return self.get_file_handle(file_key) + + def get_file_handle(self, file_key): + check.str_param(file_key, 'file_key') + return ADLS2FileHandle( + self.client.account_name, self.file_system, self.get_full_key(file_key) + ) + + +@resource( + { + 'storage_account': Field(String, description='The storage account name.'), + 'credential': Field( + Selector( + { + 'sas': Field(StringSource, description='SAS token for the account.'), + 'key': Field(StringSource, description='Shared Access Key for the account'), + } + ), + description='The credentials with which to authenticate.', + ), + 'prefix': Field(String, description='The base path prefix to use in ADLS2'), + 'file_system': Field(String, description='The storage account filesystem (aka container)'), + 'overwrite': Field(bool, is_required=False, default_value=False), + } +) +def adls2_file_cache(init_context): + return ADLS2FileCache( + storage_account=init_context.resource_config['storage_account'], + file_system=init_context.resource_config['file_system'], + prefix=init_context.resource_config['prefix'], + credential=init_context.resource_config['credential'], + overwrite=init_context.resource_config['overwrite'], + # TODO: resource dependencies + ) diff --git a/python_modules/libraries/dagster-azure/dagster_azure/adls2/file_manager.py b/python_modules/libraries/dagster-azure/dagster_azure/adls2/file_manager.py new file mode 100644 index 0000000000000..a63bbeb5b0655 --- /dev/null +++ b/python_modules/libraries/dagster-azure/dagster_azure/adls2/file_manager.py @@ -0,0 +1,109 @@ +import io +import uuid +from contextlib import contextmanager + +from dagster import check, usable_as_dagster_type +from dagster.core.storage.file_manager import ( + FileHandle, + FileManager, + TempfileManager, + check_file_like_obj, +) + + +@usable_as_dagster_type +class ADLS2FileHandle(FileHandle): + def __init__(self, account, file_system, key): + self._account = check.str_param(account, 'account') + self._file_system = check.str_param(file_system, 'file_system') + self._key = check.str_param(key, 'key') + + @property + def account(self): + return 
self._account + + @property + def file_system(self): + return self._file_system + + @property + def key(self): + return self._key + + @property + def path_desc(self): + return self.adls2_path + + @property + def adls2_path(self): + return 'abfss://{file_system}@{account}.dfs.core.windows.net/{key}'.format( + file_system=self.file_system, account=self.account, key=self.key, + ) + + +class ADLS2FileManager(FileManager): + def __init__(self, adls2_client, file_system, prefix): + self._client = adls2_client + self._file_system = check.str_param(file_system, 'file_system') + self._prefix = check.str_param(prefix, 'prefix') + self._local_handle_cache = {} + self._temp_file_manager = TempfileManager() + + def copy_handle_to_local_temp(self, file_handle): + self._download_if_not_cached(file_handle) + return self._get_local_path(file_handle) + + def _download_if_not_cached(self, file_handle): + if not self._file_handle_cached(file_handle): + # instigate download + temp_file_obj = self._temp_file_manager.tempfile() + temp_name = temp_file_obj.name + file = self._client.get_file_client( + file_system=file_handle.file_system, file_path=file_handle.key, + ) + download = file.download_file() + with open(temp_name, 'wb') as file_obj: + download.readinto(file_obj) + self._local_handle_cache[file_handle.adls2_path] = temp_name + + return file_handle + + @contextmanager + def read(self, file_handle, mode='rb'): + check.inst_param(file_handle, 'file_handle', ADLS2FileHandle) + check.str_param(mode, 'mode') + check.param_invariant(mode in {'r', 'rb'}, 'mode') + + self._download_if_not_cached(file_handle) + + with open(self._get_local_path(file_handle), mode) as file_obj: + yield file_obj + + def _file_handle_cached(self, file_handle): + return file_handle.adls2_path in self._local_handle_cache + + def _get_local_path(self, file_handle): + return self._local_handle_cache[file_handle.adls2_path] + + def read_data(self, file_handle): + with self.read(file_handle, mode='rb') as file_obj: + return file_obj.read() + + def write_data(self, data, ext=None): + check.inst_param(data, 'data', bytes) + return self.write(io.BytesIO(data), mode='wb', ext=ext) + + def write(self, file_obj, mode='wb', ext=None): # pylint: disable=unused-argument + check_file_like_obj(file_obj) + adls2_key = self.get_full_key(str(uuid.uuid4()) + (('.' + ext) if ext is not None else '')) + adls2_file = self._client.get_file_client( + file_system=self._file_system, file_path=adls2_key + ) + adls2_file.upload_data(file_obj, overwrite=True) + return ADLS2FileHandle(self._client.account_name, self._file_system, adls2_key) + + def get_full_key(self, file_key): + return '{base_key}/{file_key}'.format(base_key=self._prefix, file_key=file_key) + + def delete_local_temp(self): + self._temp_file_manager.close() diff --git a/python_modules/libraries/dagster-azure/dagster_azure/adls2/intermediate_store.py b/python_modules/libraries/dagster-azure/dagster_azure/adls2/intermediate_store.py new file mode 100644 index 0000000000000..2c2fc5cb0610c --- /dev/null +++ b/python_modules/libraries/dagster-azure/dagster_azure/adls2/intermediate_store.py @@ -0,0 +1,38 @@ +from dagster import check +from dagster.core.storage.intermediate_store import IntermediateStore +from dagster.core.storage.type_storage import TypeStoragePluginRegistry + +from .object_store import ADLS2ObjectStore + + +class ADLS2IntermediateStore(IntermediateStore): + '''Intermediate store using Azure Data Lake Storage Gen2.
+ + This intermediate store uses ADLS2 APIs to communicate with the storage, + which are better optimised for various tasks than regular Blob storage. + ''' + + def __init__( + self, file_system, run_id, client, type_storage_plugin_registry=None, prefix='dagster', + ): + check.str_param(file_system, 'file_system') + check.str_param(prefix, 'prefix') + check.str_param(run_id, 'run_id') + + object_store = ADLS2ObjectStore(file_system, client) + + def root_for_run_id(r_id): + return object_store.key_for_paths([prefix, 'storage', r_id]) + + super(ADLS2IntermediateStore, self).__init__( + object_store, + root_for_run_id=root_for_run_id, + run_id=run_id, + type_storage_plugin_registry=check.inst_param( + type_storage_plugin_registry + if type_storage_plugin_registry + else TypeStoragePluginRegistry(types_to_register=[]), + 'type_storage_plugin_registry', + TypeStoragePluginRegistry, + ), + ) diff --git a/python_modules/libraries/dagster-azure/dagster_azure/adls2/object_store.py b/python_modules/libraries/dagster-azure/dagster_azure/adls2/object_store.py new file mode 100644 index 0000000000000..4856513e2eb2c --- /dev/null +++ b/python_modules/libraries/dagster-azure/dagster_azure/adls2/object_store.py @@ -0,0 +1,155 @@ +import logging +import re +import sys +from io import BytesIO, StringIO + +from azure.core.exceptions import ResourceNotFoundError + +from dagster import check +from dagster.core.definitions.events import ObjectStoreOperation, ObjectStoreOperationType +from dagster.core.storage.object_store import ObjectStore +from dagster.core.types.marshal import SerializationStrategy +from dagster.seven import urlparse + +from dagster_azure.blob.utils import create_blob_client + +DEFAULT_LEASE_DURATION = 60 # One minute + + +class ADLS2ObjectStore(ObjectStore): + def __init__(self, file_system, client, lease_duration=DEFAULT_LEASE_DURATION): + self.adls2_client = client + self.file_system_client = self.adls2_client.get_file_system_client(file_system) + # We also need a blob client to handle copying as ADLS doesn't have a copy API yet + self.blob_client = create_blob_client( + client.account_name, + # client.credential is non-null if a secret key was used to authenticate + client.credential.account_key if client.credential is not None + # otherwise the SAS token will be in the query string of the URL + else urlparse(client.url).query, + ) + self.blob_container_client = self.blob_client.get_container_client(file_system) + + self.lease_duration = lease_duration + self.file_system_client.get_file_system_properties() + super(ADLS2ObjectStore, self).__init__('adls2', sep='/') + + def set_object(self, key, obj, serialization_strategy=None): + check.str_param(key, 'key') + + logging.info('Writing ADLS2 object at: ' + self.uri_for_key(key)) + + # cannot check obj since could be arbitrary Python object + check.inst_param( + serialization_strategy, 'serialization_strategy', SerializationStrategy + ) # cannot be none here + + if self.has_object(key): + logging.warning('Removing existing ADLS2 key: {key}'.format(key=key)) + self.rm_object(key) + + file = self.file_system_client.create_file(key) + with file.acquire_lease(self.lease_duration) as lease: + with BytesIO() as bytes_io: + if serialization_strategy.write_mode == 'w' and sys.version_info >= (3, 0): + with StringIO() as string_io: + string_io = StringIO() + serialization_strategy.serialize(obj, string_io) + string_io.seek(0) + bytes_io.write(string_io.read().encode('utf-8')) + else: + serialization_strategy.serialize(obj, bytes_io) + 
bytes_io.seek(0) + file.upload_data(bytes_io, lease=lease, overwrite=True) + + return ObjectStoreOperation( + op=ObjectStoreOperationType.SET_OBJECT, + key=self.uri_for_key(key), + dest_key=None, + obj=obj, + serialization_strategy_name=serialization_strategy.name, + object_store_name=self.name, + ) + + def get_object(self, key, serialization_strategy=None): + check.str_param(key, 'key') + check.param_invariant(len(key) > 0, 'key') + check.inst_param( + serialization_strategy, 'serialization_strategy', SerializationStrategy + ) # cannot be none here + + # FIXME we need better error handling for object store + file = self.file_system_client.get_file_client(key) + stream = file.download_file() + obj = serialization_strategy.deserialize( + BytesIO(stream.readall()) + if serialization_strategy.read_mode == 'rb' + else StringIO(stream.readall().decode(serialization_strategy.encoding)) + ) + return ObjectStoreOperation( + op=ObjectStoreOperationType.GET_OBJECT, + key=self.uri_for_key(key), + dest_key=None, + obj=obj, + serialization_strategy_name=serialization_strategy.name, + object_store_name=self.name, + ) + + def has_object(self, key): + check.str_param(key, 'key') + check.param_invariant(len(key) > 0, 'key') + + try: + file = self.file_system_client.get_file_client(key) + file.get_file_properties() + return True + except ResourceNotFoundError: + return False + + def rm_object(self, key): + check.str_param(key, 'key') + check.param_invariant(len(key) > 0, 'key') + + # This operates recursively already so is nice and simple. + self.file_system_client.delete_file(key) + + return ObjectStoreOperation( + op=ObjectStoreOperationType.RM_OBJECT, + key=self.uri_for_key(key), + dest_key=None, + obj=None, + serialization_strategy_name=None, + object_store_name=self.name, + ) + + def cp_object(self, src, dst): + check.str_param(src, 'src') + check.str_param(dst, 'dst') + + # Manually recurse and copy anything that looks like a file. 
+ for src_blob_properties in self.blob_container_client.list_blobs(src): + # This is the only way I can find to identify a 'directory' + if src_blob_properties["content_settings"] is None: + # Ignore this blob + continue + src_blob = self.blob_container_client.get_blob_client(src_blob_properties["name"]) + new_blob_path = re.sub(r'^{}'.format(src), dst, src_blob_properties["name"]) + new_blob = self.blob_container_client.get_blob_client(new_blob_path) + new_blob.start_copy_from_url(src_blob.url) + + return ObjectStoreOperation( + op=ObjectStoreOperationType.CP_OBJECT, + key=self.uri_for_key(src), + dest_key=self.uri_for_key(dst), + object_store_name=self.name, + ) + + def uri_for_key(self, key, protocol=None): + check.str_param(key, 'key') + protocol = check.opt_str_param(protocol, 'protocol', default='abfss://') + return '{protocol}{filesystem}@{account}.dfs.core.windows.net/{key}'.format( + protocol=protocol, + filesystem=self.file_system_client.file_system_name, + account=self.file_system_client.account_name, + key=key, + ) diff --git a/python_modules/libraries/dagster-azure/dagster_azure/adls2/resources.py b/python_modules/libraries/dagster-azure/dagster_azure/adls2/resources.py new file mode 100644 index 0000000000000..7a51a4b4adb6d --- /dev/null +++ b/python_modules/libraries/dagster-azure/dagster_azure/adls2/resources.py @@ -0,0 +1,76 @@ +from dagster import Field, Selector, String, StringSource, resource + +from .utils import create_adls2_client + + +@resource( + { + 'storage_account': Field(String, description='The storage account name.'), + 'credential': Field( + Selector( + { + 'sas': Field(StringSource, description='SAS token for the account.'), + 'key': Field(StringSource, description='Shared Access Key for the account'), + } + ), + description='The credentials with which to authenticate.', + ), + } +) +def adls2_resource(context): + '''Resource that gives solids access to Azure Data Lake Storage Gen2. + + The underlying client is a :py:class:`~azure.storage.filedatalake.DataLakeServiceClient`. + + Attach this resource definition to a :py:class:`~dagster.ModeDefinition` in order to make it + available to your solids. + + Example: + + .. code-block:: python + + from dagster import ModeDefinition, execute_solid, solid + from dagster_azure.adls2 import adls2_resource + + @solid(required_resource_keys={'adls2'}) + def example_adls2_solid(context): + return list(context.resources.adls2.list_file_systems()) + + result = execute_solid( + example_adls2_solid, + environment_dict={ + 'resources': { + 'adls2': { + 'config': { + 'storage_account': 'my_storage_account' + } + } + } + }, + mode_def=ModeDefinition(resource_defs={'adls2': adls2_resource}), + ) + + Note that your solids must also declare that they require this resource with + `required_resource_keys`, or it will not be initialized for the execution of their compute + functions. + + You may pass credentials to this resource using either a SAS token or a key, using + environment variables if desired: + + .. code-block:: YAML + + resources: + adls2: + config: + storage_account: my_storage_account + # str: The storage account name. + credential: + sas: my_sas_token + # str: the SAS token for the account. + key: + env: AZURE_DATA_LAKE_STORAGE_KEY + # str: The shared access key for the account. 
+ ''' + storage_account = context.resource_config['storage_account'] + credential = context.resource_config["credential"].copy().popitem()[1] + return create_adls2_client(storage_account, credential) diff --git a/python_modules/libraries/dagster-azure/dagster_azure/adls2/system_storage.py b/python_modules/libraries/dagster-azure/dagster_azure/adls2/system_storage.py new file mode 100644 index 0000000000000..edde88d383c7f --- /dev/null +++ b/python_modules/libraries/dagster-azure/dagster_azure/adls2/system_storage.py @@ -0,0 +1,75 @@ +from dagster import Field, String, SystemStorageData, system_storage +from dagster.core.storage.intermediates_manager import IntermediateStoreIntermediatesManager +from dagster.core.storage.system_storage import fs_system_storage, mem_system_storage + +from .file_manager import ADLS2FileManager +from .intermediate_store import ADLS2IntermediateStore + + +@system_storage( + name='adls2', + is_persistent=True, + config={ + 'adls2_file_system': Field(String, description='ADLS Gen2 file system name'), + 'adls2_prefix': Field(String, is_required=False, default_value='dagster'), + }, + required_resource_keys={'adls2'}, +) +def adls2_system_storage(init_context): + '''Persistent system storage using Azure Data Lake Storage Gen2 for storage. + + Suitable for intermediates storage for distributed executors, so long as + each execution node has network connectivity and credentials for ADLS and + the backing container. + + Attach this system storage definition, as well as the :py:data:`~dagster_azure.adls2_resource` + it requires, to a :py:class:`~dagster.ModeDefinition` in order to make it available to your + pipeline: + + .. code-block:: python + + pipeline_def = PipelineDefinition( + mode_defs=[ + ModeDefinition( + resource_defs={'adls2': adls2_resource, ...}, + system_storage_defs=default_system_storage_defs + [adls2_system_storage, ...], + ... + ), ... + ], ... + ) + + You may configure this storage as follows: + + .. 
code-block:: YAML + + storage: + adls2: + config: + adls2_sa: my-best-storage-account + adls2_file_system: my-cool-file-system + adls2_prefix: good/prefix-for-files- + ''' + client = init_context.resources.adls2 + adls2_base = '{prefix}/storage/{run_id}/files'.format( + prefix=init_context.system_storage_config['adls2_prefix'], + run_id=init_context.pipeline_run.run_id, + ) + return SystemStorageData( + file_manager=ADLS2FileManager( + adls2_client=client, + file_system=init_context.system_storage_config['adls2_file_system'], + prefix=adls2_base, + ), + intermediates_manager=IntermediateStoreIntermediatesManager( + ADLS2IntermediateStore( + client=client, + file_system=init_context.system_storage_config['adls2_file_system'], + prefix=init_context.system_storage_config['adls2_prefix'], + run_id=init_context.pipeline_run.run_id, + type_storage_plugin_registry=init_context.type_storage_plugin_registry, + ) + ), + ) + + +adls2_plus_default_storage_defs = [mem_system_storage, fs_system_storage, adls2_system_storage] diff --git a/python_modules/libraries/dagster-azure/dagster_azure/adls2/utils.py b/python_modules/libraries/dagster-azure/dagster_azure/adls2/utils.py new file mode 100644 index 0000000000000..927bde23c10a7 --- /dev/null +++ b/python_modules/libraries/dagster-azure/dagster_azure/adls2/utils.py @@ -0,0 +1,13 @@ +from azure.storage.filedatalake import DataLakeServiceClient + + +def _create_url(storage_account, subdomain): + return "https://{}.{}.core.windows.net/".format(storage_account, subdomain) + + +def create_adls2_client(storage_account, credential): + """ + Create an ADLS2 client. + """ + account_url = _create_url(storage_account, "dfs") + return DataLakeServiceClient(account_url, credential) diff --git a/python_modules/libraries/dagster-azure/dagster_azure/blob/__init__.py b/python_modules/libraries/dagster-azure/dagster_azure/blob/__init__.py new file mode 100644 index 0000000000000..9a1d43bfbf9fe --- /dev/null +++ b/python_modules/libraries/dagster-azure/dagster_azure/blob/__init__.py @@ -0,0 +1,2 @@ +from .blob_fake_resource import create_blob_fake_resource +from .compute_log_manager import AzureBlobComputeLogManager diff --git a/python_modules/libraries/dagster-azure/dagster_azure/blob/blob_fake_resource.py b/python_modules/libraries/dagster-azure/dagster_azure/blob/blob_fake_resource.py new file mode 100644 index 0000000000000..2634b56913533 --- /dev/null +++ b/python_modules/libraries/dagster-azure/dagster_azure/blob/blob_fake_resource.py @@ -0,0 +1,164 @@ +from collections import defaultdict +from contextlib import contextmanager +import io +import random + +from azure.core.exceptions import ResourceNotFoundError + +from dagster.seven import mock + + +def create_blob_fake_resource(account_name): + '''Create a mock Blob client for test.''' + return BlobFakeClient(account_name, 'fake-creds') + + +class BlobFakeClient(object): + '''Stateful mock of an Blob client for testing. + + Wraps a ``mock.MagicMock``. Containers are implemented using an in-memory dict. 
+ ''' + + def __init__(self, account_name, credential): + + self._account_name = account_name + self._credential = mock.MagicMock() + self._credential.account_key = credential + self._containers = {} + + @property + def account_name(self): + return self._account_name + + @property + def credential(self): + return self._credential + + @property + def containers(self): + return self._containers + + def get_container_client(self, container): + return self._containers.setdefault( + container, BlobFakeContainerClient(self.account_name, container) + ) + + def get_blob_client(self, container, blob): + return self.get_container_client(container).get_blob_client(blob) + + +class BlobFakeContainerClient(object): + '''Stateful mock of an Blob container client for testing.''' + + def __init__(self, account_name, container_name): + self._container = defaultdict(BlobFakeBlobClient) + self._account_name = account_name + self._container_name = container_name + + @property + def account_name(self): + return self._account_name + + @property + def container_name(self): + return self._container_name + + def keys(self): + return self._container.keys() + + def get_container_properties(self): + return {"account_name": self.account_name, "container_name": self.container_name} + + def has_blob(self, path): + return bool(self._container.get(path)) + + def get_blob_client(self, blob): + return self._container[blob] + + def create_blob(self, blob): + return self._container[blob] + + def list_blobs(self, name_starts_with=None): + for k, v in self._container.items(): + if name_starts_with is None or k.startswith(name_starts_with): + yield { + 'name': k, + # This clearly isn't actually the URL but we need a way of copying contents + # across blobs and this allows us to do it + 'url': v.contents, + } + + def delete_blob(self, blob): + # Use list to avoid mutating dict as we iterate + for k in list(self._container.keys()): + if k.startswith(blob): + del self._container[k] + + +class BlobFakeBlobClient(object): + '''Stateful mock of an Blob blob client for testing.''' + + def __init__(self): + self.contents = None + self.lease = None + + def start_copy_from_url(self, url): + self.contents = url + + def get_blob_properties(self): + if self.contents is None: + raise ResourceNotFoundError("File does not exist!") + return {"lease": self.lease} + + def upload_blob(self, contents, overwrite=False, lease=None): + if self.lease is not None: + if lease != self.lease: + raise Exception("Invalid lease!") + if self.contents is None or overwrite is True: + if isinstance(contents, str): + self.contents = contents.encode('utf8') + elif isinstance(contents, io.TextIOBase): + self.contents = contents.read().encode('utf8') + elif isinstance(contents, io.IOBase): + self.contents = contents.read() + elif isinstance(contents, bytes): + self.contents = contents + # Python 2 compatibility - no base class for `file` type + elif hasattr(contents, 'read'): + self.contents = contents.read() + else: + print("Uploading unknown data") + self.contents = contents + + @property + def url(self): + return ':memory:' + + @contextmanager + def acquire_lease(self, lease_duration=-1): # pylint: disable=unused-argument + if self.lease is None: + self.lease = random.randint(0, 2 ** 9) + try: + yield self.lease + finally: + self.lease = None + else: + raise Exception("Lease already held") + + def download_blob(self): + if self.contents is None: + raise ResourceNotFoundError("File does not exist!") + return BlobFakeBlobDownloader(contents=self.contents) + + +class 
BlobFakeBlobDownloader(object): + '''Mock of a Blob file downloader for testing.''' + + def __init__(self, contents): + self.contents = contents + + def readall(self): + return self.contents + + def readinto(self, fileobj): + fileobj.write(self.contents) diff --git a/python_modules/libraries/dagster-azure/dagster_azure/blob/compute_log_manager.py b/python_modules/libraries/dagster-azure/dagster_azure/blob/compute_log_manager.py new file mode 100644 index 0000000000000..adf53296dfe06 --- /dev/null +++ b/python_modules/libraries/dagster-azure/dagster_azure/blob/compute_log_manager.py @@ -0,0 +1,200 @@ +import itertools +import os +from contextlib import contextmanager + +from azure.storage.blob import generate_blob_sas +from dagster import Field, check, seven +from dagster.core.storage.compute_log_manager import ( + MAX_BYTES_FILE_READ, + ComputeIOType, + ComputeLogFileData, + ComputeLogManager, +) +from dagster.core.storage.local_compute_log_manager import IO_TYPE_EXTENSION, LocalComputeLogManager +from dagster.serdes import ConfigurableClass, ConfigurableClassData +from dagster.utils import ensure_dir, ensure_file + +from .utils import create_blob_client + + +class AzureBlobComputeLogManager(ComputeLogManager, ConfigurableClass): + '''Logs solid compute function stdout and stderr to Azure Blob Storage. + + This is also compatible with Azure Data Lake Storage. + + Users should not instantiate this class directly. Instead, use a YAML block in ``dagster.yaml`` + such as the following: + + .. code-block:: YAML + + compute_logs: + module: dagster_azure.blob.compute_log_manager + class: AzureBlobComputeLogManager + config: + storage_account: my-storage-account + container: my-container + credential: sas-token-or-secret-key + prefix: "dagster-test-" + local_dir: "/tmp/cool" + + Args: + storage_account (str): The storage account name to which to log. + container (str): The container (or ADLS2 filesystem) to which to log. + secret_key (str): Secret key for the storage account. SAS tokens are not + supported because we need a secret key to generate a SAS token for a download URL. + local_dir (Optional[str]): Path to the local directory in which to stage logs. Default: + ``dagster.seven.get_system_temp_directory()``. + prefix (Optional[str]): Prefix for the log file keys. + inst_data (Optional[ConfigurableClassData]): Serializable representation of the compute + log manager when newed up from config. 
+ ''' + + def __init__( + self, + storage_account, + container, + secret_key, + local_dir=None, + inst_data=None, + prefix='dagster', + ): + self._storage_account = check.str_param(storage_account, 'storage_account') + self._container = check.str_param(container, 'container') + self._blob_prefix = check.str_param(prefix, 'prefix') + check.str_param(secret_key, 'secret_key') + + self._blob_client = create_blob_client(storage_account, secret_key) + self._container_client = self._blob_client.get_container_client(container) + self._download_urls = {} + + # proxy calls to local compute log manager (for subscriptions, etc) + if not local_dir: + local_dir = seven.get_system_temp_directory() + + self.local_manager = LocalComputeLogManager(local_dir) + self._inst_data = check.opt_inst_param(inst_data, 'inst_data', ConfigurableClassData) + + @contextmanager + def _watch_logs(self, pipeline_run, step_key=None): + # proxy watching to the local compute log manager, interacting with the filesystem + with self.local_manager._watch_logs( # pylint: disable=protected-access + pipeline_run, step_key + ): + yield + + @property + def inst_data(self): + return self._inst_data + + @classmethod + def config_type(cls): + return { + 'storage_account': str, + 'container': str, + 'secret_key': str, + 'local_dir': Field(str, is_required=False), + 'prefix': Field(str, is_required=False, default_value='dagster'), + } + + @staticmethod + def from_config_value(inst_data, config_value): + return AzureBlobComputeLogManager(inst_data=inst_data, **config_value) + + def get_local_path(self, run_id, key, io_type): + return self.local_manager.get_local_path(run_id, key, io_type) + + def on_watch_start(self, pipeline_run, step_key): + self.local_manager.on_watch_start(pipeline_run, step_key) + + def on_watch_finish(self, pipeline_run, step_key): + self.local_manager.on_watch_finish(pipeline_run, step_key) + key = self.local_manager.get_key(pipeline_run, step_key) + self._upload_from_local(pipeline_run.run_id, key, ComputeIOType.STDOUT) + self._upload_from_local(pipeline_run.run_id, key, ComputeIOType.STDERR) + + def is_watch_completed(self, run_id, key): + return self.local_manager.is_watch_completed(run_id, key) + + def download_url(self, run_id, key, io_type): + if not self.is_watch_completed(run_id, key): + return self.local_manager.download_url(run_id, key, io_type) + key = self._blob_key(run_id, key, io_type) + if key in self._download_urls: + return self._download_urls[key] + blob = self._container_client.get_blob_client(key) + sas = generate_blob_sas( + self._storage_account, + self._container, + key, + account_key=self._blob_client.credential.account_key, + ) + url = blob.url + sas + self._download_urls[key] = url + return url + + def read_logs_file(self, run_id, key, io_type, cursor=0, max_bytes=MAX_BYTES_FILE_READ): + if self._should_download(run_id, key, io_type): + self._download_to_local(run_id, key, io_type) + data = self.local_manager.read_logs_file(run_id, key, io_type, cursor, max_bytes) + return self._from_local_file_data(run_id, key, io_type, data) + + def on_subscribe(self, subscription): + self.local_manager.on_subscribe(subscription) + + def _should_download(self, run_id, key, io_type): + local_path = self.get_local_path(run_id, key, io_type) + if os.path.exists(local_path): + return False + blob_objects = self._container_client.list_blobs(self._blob_key(run_id, key, io_type)) + # Limit the generator to avoid paging since we only need one element + # to return True + limited_blob_objects = 
itertools.islice(blob_objects, 1) + return len(list(limited_blob_objects)) > 0 + + def _from_local_file_data(self, run_id, key, io_type, local_file_data): + is_complete = self.is_watch_completed(run_id, key) + path = ( + 'https://{account}.blob.core.windows.net/{container}/{key}'.format( + account=self._storage_account, + container=self._container, + key=self._blob_key(run_id, key, io_type), + ) + if is_complete + else local_file_data.path + ) + + return ComputeLogFileData( + path, + local_file_data.data, + local_file_data.cursor, + local_file_data.size, + self.download_url(run_id, key, io_type), + ) + + def _upload_from_local(self, run_id, key, io_type): + path = self.get_local_path(run_id, key, io_type) + ensure_file(path) + key = self._blob_key(run_id, key, io_type) + with open(path, 'rb') as data: + blob = self._container_client.get_blob_client(key) + blob.upload_blob(data) + + def _download_to_local(self, run_id, key, io_type): + path = self.get_local_path(run_id, key, io_type) + ensure_dir(os.path.dirname(path)) + key = self._blob_key(run_id, key, io_type) + with open(path, 'wb') as fileobj: + blob = self._container_client.get_blob_client(key) + blob.download_blob().readinto(fileobj) + + def _blob_key(self, run_id, key, io_type): + check.inst_param(io_type, 'io_type', ComputeIOType) + extension = IO_TYPE_EXTENSION[io_type] + paths = [ + self._blob_prefix, + 'storage', + run_id, + 'compute_logs', + '{}.{}'.format(key, extension), + ] + return '/'.join(paths) # blob path delimiter diff --git a/python_modules/libraries/dagster-azure/dagster_azure/blob/intermediate_store.py b/python_modules/libraries/dagster-azure/dagster_azure/blob/intermediate_store.py new file mode 100644 index 0000000000000..603f3d6863ce3 --- /dev/null +++ b/python_modules/libraries/dagster-azure/dagster_azure/blob/intermediate_store.py @@ -0,0 +1,41 @@ +from dagster import check +from dagster.core.storage.intermediate_store import IntermediateStore +from dagster.core.storage.type_storage import TypeStoragePluginRegistry + +from .object_store import AzureBlobObjectStore + + +class AzureBlobIntermediateStore(IntermediateStore): + '''Intermediate store using Azure Blob storage. + + If your storage account has the ADLS Gen2 hierarchical namespace enabled + this should still work, but it is recommended to use the + :py:class:`~dagster_azure.adls2.intermediate_store.ADLS2IntermediateStore` + instead, which will enable some optimizations for certain types (notably + PySpark DataFrames). 
+ ''' + + def __init__( + self, container, run_id, client, type_storage_plugin_registry=None, prefix='dagster', + ): + check.str_param(container, 'container') + check.str_param(prefix, 'prefix') + check.str_param(run_id, 'run_id') + + object_store = AzureBlobObjectStore(container, client) + + def root_for_run_id(r_id): + return object_store.key_for_paths([prefix, 'storage', r_id]) + + super(AzureBlobIntermediateStore, self).__init__( + object_store, + root_for_run_id=root_for_run_id, + run_id=run_id, + type_storage_plugin_registry=check.inst_param( + type_storage_plugin_registry + if type_storage_plugin_registry + else TypeStoragePluginRegistry(types_to_register=[]), + 'type_storage_plugin_registry', + TypeStoragePluginRegistry, + ), + ) diff --git a/python_modules/libraries/dagster-azure/dagster_azure/blob/object_store.py b/python_modules/libraries/dagster-azure/dagster_azure/blob/object_store.py new file mode 100644 index 0000000000000..be6729cdc09b3 --- /dev/null +++ b/python_modules/libraries/dagster-azure/dagster_azure/blob/object_store.py @@ -0,0 +1,138 @@ +import logging +import re +import sys +from io import BytesIO, StringIO + +from azure.core.exceptions import ResourceNotFoundError + +from dagster import check +from dagster.core.definitions.events import ObjectStoreOperation, ObjectStoreOperationType +from dagster.core.storage.object_store import ObjectStore +from dagster.core.types.marshal import SerializationStrategy + +DEFAULT_LEASE_DURATION = 60 * 60 # One hour + + +class AzureBlobObjectStore(ObjectStore): + def __init__(self, container, client, lease_duration=DEFAULT_LEASE_DURATION): + self.blob_client = client + self.container_client = self.blob_client.get_container_client(container) + + self.lease_duration = lease_duration + self.container_client.get_container_properties() + super(AzureBlobObjectStore, self).__init__('azure-blob', sep='/') + + def set_object(self, key, obj, serialization_strategy=None): + check.str_param(key, 'key') + + logging.info('Writing Azure Blob object at: ' + self.uri_for_key(key)) + + # cannot check obj since could be arbitrary Python object + check.inst_param( + serialization_strategy, 'serialization_strategy', SerializationStrategy + ) # cannot be none here + + blob = self.container_client.create_blob(key) + with blob.acquire_lease(self.lease_duration) as lease: + with BytesIO() as bytes_io: + if serialization_strategy.write_mode == 'w' and sys.version_info >= (3, 0): + with StringIO() as string_io: + string_io = StringIO() + serialization_strategy.serialize(obj, string_io) + string_io.seek(0) + bytes_io.write(string_io.read().encode('utf-8')) + else: + serialization_strategy.serialize(obj, bytes_io) + bytes_io.seek(0) + blob.upload_blob(bytes_io, lease=lease, overwrite=True) + + return ObjectStoreOperation( + op=ObjectStoreOperationType.SET_OBJECT, + key=self.uri_for_key(key), + dest_key=None, + obj=obj, + serialization_strategy_name=serialization_strategy.name, + object_store_name=self.name, + ) + + def get_object(self, key, serialization_strategy=None): + check.str_param(key, 'key') + check.param_invariant(len(key) > 0, 'key') + check.inst_param( + serialization_strategy, 'serialization_strategy', SerializationStrategy + ) # cannot be none here + + # FIXME we need better error handling for object store + blob = self.container_client.download_blob(key) + obj = serialization_strategy.deserialize( + BytesIO(blob.readall()) + if serialization_strategy.read_mode == 'rb' + else StringIO(blob.readall().decode(serialization_strategy.encoding)) + ) + 
return ObjectStoreOperation( + op=ObjectStoreOperationType.GET_OBJECT, + key=self.uri_for_key(key), + dest_key=None, + obj=obj, + serialization_strategy_name=serialization_strategy.name, + object_store_name=self.name, + ) + + def has_object(self, key): + check.str_param(key, 'key') + check.param_invariant(len(key) > 0, 'key') + + try: + blob = self.container_client.get_blob_client(key) + blob.get_blob_properties() + return True + except ResourceNotFoundError: + return False + + def rm_object(self, key): + check.str_param(key, 'key') + check.param_invariant(len(key) > 0, 'key') + + for blob in self.container_client.list_blobs(key): + self.container_client.delete_blob(blob) + + return ObjectStoreOperation( + op=ObjectStoreOperationType.RM_OBJECT, + key=self.uri_for_key(key), + dest_key=None, + obj=None, + serialization_strategy_name=None, + object_store_name=self.name, + ) + + def cp_object(self, src, dst): + check.str_param(src, 'src') + check.str_param(dst, 'dst') + + # Manually recurse and copy anything that looks like a file. + for src_blob_properties in self.container_client.list_blobs(src): + # This is the only way I can find to identify a 'directory' + if src_blob_properties['content_settings'] is None: + # Ignore this blob + continue + src_blob = self.container_client.get_blob_client(src_blob_properties['name']) + dst_blob_path = re.sub(r'^{}'.format(src), dst, src_blob_properties['name']) + dst_blob = self.container_client.get_blob_client(dst_blob_path) + dst_blob.start_copy_from_url(src_blob.url) + + return ObjectStoreOperation( + op=ObjectStoreOperationType.CP_OBJECT, + key=self.uri_for_key(src), + dest_key=self.uri_for_key(dst), + object_store_name=self.name, + ) + + def uri_for_key(self, key, protocol=None): + check.str_param(key, 'key') + protocol = check.opt_str_param(protocol, 'protocol', default='https://') + return '{protocol}@{account}.blob.core.windows.net/{container}/{key}'.format( + protocol=protocol, + account=self.blob_client.account_name, + container=self.container_client.container_name, + key=key, + ) diff --git a/python_modules/libraries/dagster-azure/dagster_azure/blob/utils.py b/python_modules/libraries/dagster-azure/dagster_azure/blob/utils.py new file mode 100644 index 0000000000000..01f01d156fbe9 --- /dev/null +++ b/python_modules/libraries/dagster-azure/dagster_azure/blob/utils.py @@ -0,0 +1,15 @@ +from azure.storage.blob import BlobServiceClient + + +def _create_url(storage_account, subdomain): + return "https://{}.{}.core.windows.net/".format(storage_account, subdomain) + + +def create_blob_client(storage_account, credential): + """ + Create a Blob Storage client. 
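+
+    A minimal usage sketch (the account name, access key and container below are
+    placeholders, not values from this change):
+
+        client = create_blob_client("my-storage-account", "my-access-key")
+        container_client = client.get_container_client("my-container")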
+ """ + account_url = _create_url(storage_account, "blob") + if hasattr(credential, "account_key"): + credential = credential.account_key + return BlobServiceClient(account_url, credential) diff --git a/python_modules/libraries/dagster-azure/dagster_azure/version.py b/python_modules/libraries/dagster-azure/dagster_azure/version.py new file mode 100644 index 0000000000000..3d73cc3e7f754 --- /dev/null +++ b/python_modules/libraries/dagster-azure/dagster_azure/version.py @@ -0,0 +1,3 @@ +__version__ = '0.7.12' + +__nightly__ = '2020.05.11' diff --git a/python_modules/libraries/dagster-azure/dagster_azure_tests/__init__.py b/python_modules/libraries/dagster-azure/dagster_azure_tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/__init__.py b/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/conftest.py b/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/conftest.py new file mode 100644 index 0000000000000..da13cc0c80694 --- /dev/null +++ b/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/conftest.py @@ -0,0 +1,16 @@ +import pytest + + +@pytest.fixture(scope='session') +def storage_account(): + yield 'dagsterdatabrickstests' + + +@pytest.fixture(scope='session') +def file_system(): + yield 'dagster-databricks-tests' + + +@pytest.fixture(scope='session') +def credential(): + yield 'super-secret-creds' diff --git a/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_adls2_file_cache.py b/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_adls2_file_cache.py new file mode 100644 index 0000000000000..1e5ef13d0c313 --- /dev/null +++ b/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_adls2_file_cache.py @@ -0,0 +1,60 @@ +import io + +from dagster_azure.adls2 import ADLS2FakeClient, ADLS2FileCache, ADLS2FileHandle + + +def test_adls2_file_cache_file_not_present(storage_account, file_system, credential): + fake_client = ADLS2FakeClient(storage_account, credential) + file_store = ADLS2FileCache( + storage_account=storage_account, + file_system=file_system, + prefix='some-prefix', + client=fake_client, + overwrite=False, + ) + + assert not file_store.has_file_object('foo') + + +def test_adls2_file_cache_file_present(storage_account, file_system, credential): + fake_client = ADLS2FakeClient(storage_account, credential) + file_store = ADLS2FileCache( + storage_account=storage_account, + file_system=file_system, + prefix='some-prefix', + client=fake_client, + overwrite=False, + ) + + assert not file_store.has_file_object('foo') + + file_store.write_binary_data('foo', 'bar'.encode()) + + assert file_store.has_file_object('foo') + + +def test_adls2_file_cache_correct_handle(storage_account, file_system, credential): + fake_client = ADLS2FakeClient(storage_account, credential) + file_store = ADLS2FileCache( + storage_account=storage_account, + file_system=file_system, + prefix='some-prefix', + client=fake_client, + overwrite=False, + ) + + assert isinstance(file_store.get_file_handle('foo'), ADLS2FileHandle) + + +def test_adls2_file_cache_write_file_object(storage_account, file_system, credential): + fake_client = ADLS2FakeClient(storage_account, credential) + file_store = ADLS2FileCache( + 
storage_account=storage_account, + file_system=file_system, + prefix='some-prefix', + client=fake_client, + overwrite=False, + ) + + stream = io.BytesIO('content'.encode()) + file_store.write_file_object('foo', stream) diff --git a/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_adls2_file_manager.py b/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_adls2_file_manager.py new file mode 100644 index 0000000000000..4882f36dc7489 --- /dev/null +++ b/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_adls2_file_manager.py @@ -0,0 +1,199 @@ +import uuid + +from dagster_azure.adls2 import ( + ADLS2FileHandle, + ADLS2FileManager, + create_adls2_fake_resource, + adls2_plus_default_storage_defs, +) +from dagster_azure.blob import create_blob_fake_resource + +from dagster import ( + InputDefinition, + Int, + ModeDefinition, + OutputDefinition, + ResourceDefinition, + execute_pipeline, + pipeline, + solid, +) +from dagster.seven import mock + +# For deps + + +def test_adls2_file_manager_write(storage_account, file_system): + file_mock = mock.MagicMock() + adls2_mock = mock.MagicMock() + adls2_mock.get_file_client.return_value = file_mock + adls2_mock.account_name = storage_account + file_manager = ADLS2FileManager(adls2_mock, file_system, 'some-key') + + foo_bytes = 'foo'.encode() + + file_handle = file_manager.write_data(foo_bytes) + + assert isinstance(file_handle, ADLS2FileHandle) + + assert file_handle.account == storage_account + assert file_handle.file_system == file_system + assert file_handle.key.startswith('some-key/') + + assert file_mock.upload_data.call_count == 1 + + file_handle = file_manager.write_data(foo_bytes, ext='foo') + + assert isinstance(file_handle, ADLS2FileHandle) + + assert file_handle.account == storage_account + assert file_handle.file_system == file_system + assert file_handle.key.startswith('some-key/') + assert file_handle.key[-4:] == '.foo' + + assert file_mock.upload_data.call_count == 2 + + +def test_adls2_file_manager_read(storage_account, file_system): + state = {'called': 0} + bar_bytes = 'bar'.encode() + + class DownloadMock(mock.MagicMock): + def readinto(self, fileobj): + fileobj.write(bar_bytes) + + class FileMock(mock.MagicMock): + def download_file(self): + state['called'] += 1 + assert state['called'] == 1 + return DownloadMock(file=self) + + class ADLS2Mock(mock.MagicMock): + def get_file_client(self, *_args, **kwargs): + state['file_system'] = kwargs['file_system'] + file_path = kwargs['file_path'] + state['file_path'] = kwargs['file_path'] + return FileMock(file_path=file_path) + + adls2_mock = ADLS2Mock() + file_manager = ADLS2FileManager(adls2_mock, file_system, 'some-key') + file_handle = ADLS2FileHandle(storage_account, file_system, 'some-key/kdjfkjdkfjkd') + with file_manager.read(file_handle) as file_obj: + assert file_obj.read() == bar_bytes + + assert state['file_system'] == file_handle.file_system + assert state['file_path'] == file_handle.key + + # read again. 
cached + with file_manager.read(file_handle) as file_obj: + assert file_obj.read() == bar_bytes + + file_manager.delete_local_temp() + + +@mock.patch('dagster_azure.adls2.object_store.create_blob_client') +def test_depends_on_adls2_resource_intermediates( + mock_create_blob_client, storage_account, file_system +): + @solid( + input_defs=[InputDefinition('num_one', Int), InputDefinition('num_two', Int)], + output_defs=[OutputDefinition(Int)], + ) + def add_numbers(_, num_one, num_two): + return num_one + num_two + + mock_create_blob_client.return_value = create_blob_fake_resource(storage_account) + adls2_fake_resource = create_adls2_fake_resource(storage_account) + + @pipeline( + mode_defs=[ + ModeDefinition( + system_storage_defs=adls2_plus_default_storage_defs, + resource_defs={'adls2': ResourceDefinition.hardcoded_resource(adls2_fake_resource)}, + ) + ] + ) + def adls2_internal_pipeline(): + return add_numbers() + + result = execute_pipeline( + adls2_internal_pipeline, + environment_dict={ + 'solids': { + 'add_numbers': {'inputs': {'num_one': {'value': 2}, 'num_two': {'value': 4}}} + }, + 'storage': {'adls2': {'config': {'adls2_file_system': file_system}}}, + }, + ) + + assert result.success + assert result.result_for_solid('add_numbers').output_value() == 6 + + assert file_system in adls2_fake_resource.file_systems + + keys = set() + for step_key, output_name in [('add_numbers.compute', 'result')]: + keys.add(create_adls2_key(result.run_id, step_key, output_name)) + + assert set(adls2_fake_resource.file_systems[file_system].keys()) == keys + + +def create_adls2_key(run_id, step_key, output_name): + return 'dagster/storage/{run_id}/intermediates/{step_key}/{output_name}'.format( + run_id=run_id, step_key=step_key, output_name=output_name + ) + + +@mock.patch('dagster_azure.adls2.object_store.create_blob_client') +def test_depends_on_adls2_resource_file_manager( + mock_create_blob_client, storage_account, file_system +): + bar_bytes = 'bar'.encode() + + @solid(output_defs=[OutputDefinition(ADLS2FileHandle)]) + def emit_file(context): + return context.file_manager.write_data(bar_bytes) + + @solid(input_defs=[InputDefinition('file_handle', ADLS2FileHandle)]) + def accept_file(context, file_handle): + local_path = context.file_manager.copy_handle_to_local_temp(file_handle) + assert isinstance(local_path, str) + assert open(local_path, 'rb').read() == bar_bytes + + mock_create_blob_client.return_value = create_blob_fake_resource(storage_account) + adls2_fake_resource = create_adls2_fake_resource(storage_account) + + @pipeline( + mode_defs=[ + ModeDefinition( + system_storage_defs=adls2_plus_default_storage_defs, + resource_defs={'adls2': ResourceDefinition.hardcoded_resource(adls2_fake_resource)}, + ) + ] + ) + def adls2_file_manager_test(): + accept_file(emit_file()) + + result = execute_pipeline( + adls2_file_manager_test, + environment_dict={'storage': {'adls2': {'config': {'adls2_file_system': file_system}}}}, + ) + + assert result.success + + keys_in_bucket = set(adls2_fake_resource.file_systems[file_system].keys()) + + for step_key, output_name in [ + ('emit_file.compute', 'result'), + ('accept_file.compute', 'result'), + ]: + keys_in_bucket.remove(create_adls2_key(result.run_id, step_key, output_name)) + + assert len(keys_in_bucket) == 1 + + file_key = list(keys_in_bucket)[0] + comps = file_key.split('/') + + assert '/'.join(comps[:-1]) == 'dagster/storage/{run_id}/files'.format(run_id=result.run_id) + + assert uuid.UUID(comps[-1]) diff --git 
a/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_intermediate_store.py b/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_intermediate_store.py new file mode 100644 index 0000000000000..a93f874228352 --- /dev/null +++ b/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_intermediate_store.py @@ -0,0 +1,457 @@ +import csv +import os +from collections import OrderedDict + +import pytest +from dagster_azure.adls2 import ( + ADLS2IntermediateStore, + adls2_plus_default_storage_defs, + adls2_resource, + create_adls2_client, +) + +from dagster import ( + Bool, + InputDefinition, + Int, + List, + ModeDefinition, + OutputDefinition, + PipelineRun, + SerializationStrategy, + String, + check, + execute_pipeline, + lambda_solid, + pipeline, + usable_as_dagster_type, +) +from dagster.core.events import DagsterEventType +from dagster.core.execution.api import create_execution_plan, execute_plan, scoped_pipeline_context +from dagster.core.execution.plan.objects import StepOutputHandle +from dagster.core.instance import DagsterInstance +from dagster.core.storage.intermediates_manager import IntermediateStoreIntermediatesManager +from dagster.core.storage.type_storage import TypeStoragePlugin, TypeStoragePluginRegistry +from dagster.core.types.dagster_type import Bool as RuntimeBool +from dagster.core.types.dagster_type import String as RuntimeString +from dagster.core.types.dagster_type import create_any_type, resolve_dagster_type +from dagster.core.utils import make_new_run_id +from dagster.utils.test import yield_empty_pipeline_context + + +class UppercaseSerializationStrategy(SerializationStrategy): # pylint: disable=no-init + def serialize(self, value, write_file_obj): + return write_file_obj.write(bytes(value.upper().encode('utf-8'))) + + def deserialize(self, read_file_obj): + return read_file_obj.read().decode('utf-8').lower() + + +LowercaseString = create_any_type( + 'LowercaseString', serialization_strategy=UppercaseSerializationStrategy('uppercase'), +) + + +nettest = pytest.mark.nettest + + +def define_inty_pipeline(should_throw=True): + @lambda_solid + def return_one(): + return 1 + + @lambda_solid(input_defs=[InputDefinition('num', Int)], output_def=OutputDefinition(Int)) + def add_one(num): + return num + 1 + + @lambda_solid + def user_throw_exception(): + raise Exception('whoops') + + @pipeline( + mode_defs=[ + ModeDefinition( + system_storage_defs=adls2_plus_default_storage_defs, + resource_defs={'adls2': adls2_resource}, + ) + ] + ) + def basic_external_plan_execution(): + add_one(return_one()) + if should_throw: + user_throw_exception() + + return basic_external_plan_execution + + +def get_step_output(step_events, step_key, output_name='result'): + for step_event in step_events: + if ( + step_event.event_type == DagsterEventType.STEP_OUTPUT + and step_event.step_key == step_key + and step_event.step_output_data.output_name == output_name + ): + return step_event + return None + + +def get_azure_credential(): + try: + return {'key': os.environ["AZURE_STORAGE_ACCOUNT_KEY"]} + except KeyError as err: + raise Exception("AZURE_STORAGE_ACCOUNT_KEY must be set for intermediate store tests") + + +def get_client(storage_account): + creds = get_azure_credential()["key"] + return create_adls2_client(storage_account, creds) + + +@nettest +def test_using_adls2_for_subplan(storage_account, file_system): + pipeline_def = define_inty_pipeline() + + environment_dict = { + 'resources': { + 'adls2': { + 'config': 
{'storage_account': storage_account, 'credential': get_azure_credential()} + } + }, + 'storage': {'adls2': {'config': {'adls2_file_system': file_system}}}, + } + + run_id = make_new_run_id() + + execution_plan = create_execution_plan(pipeline_def, environment_dict=environment_dict) + + assert execution_plan.get_step_by_key('return_one.compute') + + step_keys = ['return_one.compute'] + instance = DagsterInstance.ephemeral() + pipeline_run = PipelineRun( + pipeline_name=pipeline_def.name, run_id=run_id, environment_dict=environment_dict + ) + + return_one_step_events = list( + execute_plan( + execution_plan.build_subset_plan(step_keys), + environment_dict=environment_dict, + pipeline_run=pipeline_run, + instance=instance, + ) + ) + + assert get_step_output(return_one_step_events, 'return_one.compute') + with scoped_pipeline_context( + execution_plan.build_subset_plan(['return_one.compute']), + environment_dict, + pipeline_run, + instance, + ) as context: + + store = ADLS2IntermediateStore( + file_system=file_system, + run_id=run_id, + client=context.scoped_resources_builder.build(required_resource_keys={'adls2'}).adls2, + ) + intermediates_manager = IntermediateStoreIntermediatesManager(store) + step_output_handle = StepOutputHandle('return_one.compute') + assert intermediates_manager.has_intermediate(context, step_output_handle) + assert intermediates_manager.get_intermediate(context, Int, step_output_handle).obj == 1 + + add_one_step_events = list( + execute_plan( + execution_plan.build_subset_plan(['add_one.compute']), + environment_dict=environment_dict, + pipeline_run=pipeline_run, + instance=instance, + ) + ) + + assert get_step_output(add_one_step_events, 'add_one.compute') + with scoped_pipeline_context( + execution_plan.build_subset_plan(['add_one.compute']), + environment_dict, + pipeline_run, + instance, + ) as context: + step_output_handle = StepOutputHandle('add_one.compute') + assert intermediates_manager.has_intermediate(context, step_output_handle) + assert intermediates_manager.get_intermediate(context, Int, step_output_handle).obj == 2 + + +class FancyStringS3TypeStoragePlugin(TypeStoragePlugin): # pylint:disable=no-init + @classmethod + def compatible_with_storage_def(cls, _): + # Not needed for these tests + raise NotImplementedError() + + @classmethod + def set_object(cls, intermediate_store, obj, context, dagster_type, paths): + check.inst_param(intermediate_store, 'intermediate_store', ADLS2IntermediateStore) + paths.append(obj) + return intermediate_store.set_object('', context, dagster_type, paths) + + @classmethod + def get_object(cls, intermediate_store, _context, _dagster_type, paths): + check.inst_param(intermediate_store, 'intermediate_store', ADLS2IntermediateStore) + res = intermediate_store.object_store.file_system_client.get_paths( + intermediate_store.key_for_paths(paths) + ) + return next(res).name.split('/')[-1] + + +@nettest +def test_adls2_intermediate_store_with_type_storage_plugin(storage_account, file_system): + run_id = make_new_run_id() + + intermediate_store = ADLS2IntermediateStore( + client=get_client(storage_account), + run_id=run_id, + file_system=file_system, + type_storage_plugin_registry=TypeStoragePluginRegistry( + [(RuntimeString, FancyStringS3TypeStoragePlugin)] + ), + ) + + with yield_empty_pipeline_context(run_id=run_id) as context: + try: + intermediate_store.set_value('hello', context, RuntimeString, ['obj_name']) + + assert intermediate_store.has_object(context, ['obj_name']) + assert intermediate_store.get_value(context, 
RuntimeString, ['obj_name']) == 'hello' + + finally: + intermediate_store.rm_object(context, ['obj_name']) + + +@nettest +def test_adls2_intermediate_store_with_composite_type_storage_plugin(storage_account, file_system): + run_id = make_new_run_id() + + intermediate_store = ADLS2IntermediateStore( + client=get_client(storage_account), + run_id=run_id, + file_system=file_system, + type_storage_plugin_registry=TypeStoragePluginRegistry( + [(RuntimeString, FancyStringS3TypeStoragePlugin)] + ), + ) + with yield_empty_pipeline_context(run_id=run_id) as context: + with pytest.raises(check.NotImplementedCheckError): + intermediate_store.set_value( + ['hello'], context, resolve_dagster_type(List[String]), ['obj_name'] + ) + + +@nettest +def test_adls2_intermediate_store_composite_types_with_custom_serializer_for_inner_type( + storage_account, file_system +): + run_id = make_new_run_id() + + intermediate_store = ADLS2IntermediateStore( + client=get_client(storage_account), run_id=run_id, file_system=file_system, + ) + + obj_name = 'list' + + with yield_empty_pipeline_context(run_id=run_id) as context: + try: + intermediate_store.set_object( + ['foo', 'bar'], context, resolve_dagster_type(List[LowercaseString]), [obj_name], + ) + assert intermediate_store.has_object(context, [obj_name]) + assert intermediate_store.get_object( + context, resolve_dagster_type(List[Bool]), [obj_name] + ).obj == ['foo', 'bar'] + + finally: + intermediate_store.rm_object(context, [obj_name]) + + +@nettest +def test_adls2_intermediate_store_with_custom_serializer(storage_account, file_system): + run_id = make_new_run_id() + + intermediate_store = ADLS2IntermediateStore( + client=get_client(storage_account), run_id=run_id, file_system=file_system, + ) + + with yield_empty_pipeline_context(run_id=run_id) as context: + try: + intermediate_store.set_object('foo', context, LowercaseString, ['foo']) + + assert ( + intermediate_store.object_store.file_system_client.get_file_client( + '/'.join([intermediate_store.root] + ['foo']), + ) + .download_file() + .readall() + .decode('utf-8') + == 'FOO' + ) + + assert intermediate_store.has_object(context, ['foo']) + assert intermediate_store.get_object(context, LowercaseString, ['foo']).obj == 'foo' + finally: + intermediate_store.rm_object(context, ['foo']) + + +@nettest +def test_adls2_pipeline_with_custom_prefix(storage_account, file_system): + adls2_prefix = 'custom_prefix' + + pipe = define_inty_pipeline(should_throw=False) + environment_dict = { + 'resources': { + 'adls2': { + 'config': {'storage_account': storage_account, 'credential': get_azure_credential()} + } + }, + 'storage': { + 'adls2': {'config': {'adls2_file_system': file_system, 'adls2_prefix': adls2_prefix}} + }, + } + + pipeline_run = PipelineRun(pipeline_name=pipe.name, environment_dict=environment_dict) + instance = DagsterInstance.ephemeral() + + result = execute_pipeline(pipe, environment_dict=environment_dict,) + assert result.success + + execution_plan = create_execution_plan(pipe, environment_dict) + with scoped_pipeline_context( + execution_plan, environment_dict, pipeline_run, instance, + ) as context: + store = ADLS2IntermediateStore( + run_id=result.run_id, + file_system=file_system, + prefix=adls2_prefix, + client=context.scoped_resources_builder.build(required_resource_keys={'adls2'}).adls2, + ) + intermediates_manager = IntermediateStoreIntermediatesManager(store) + assert store.root == '/'.join(['custom_prefix', 'storage', result.run_id]) + assert ( + intermediates_manager.get_intermediate( + context, 
Int, StepOutputHandle('return_one.compute') + ).obj + == 1 + ) + assert ( + intermediates_manager.get_intermediate( + context, Int, StepOutputHandle('add_one.compute') + ).obj + == 2 + ) + + +@nettest +def test_adls2_intermediate_store_with_custom_prefix(storage_account, file_system): + run_id = make_new_run_id() + + intermediate_store = ADLS2IntermediateStore( + client=get_client(storage_account), + run_id=run_id, + file_system=file_system, + prefix='custom_prefix', + ) + assert intermediate_store.root == '/'.join(['custom_prefix', 'storage', run_id]) + + try: + with yield_empty_pipeline_context(run_id=run_id) as context: + + intermediate_store.set_object(True, context, RuntimeBool, ['true']) + + assert intermediate_store.has_object(context, ['true']) + assert intermediate_store.uri_for_paths(['true']).startswith( + 'abfss://{fs}@{account}.dfs.core.windows.net/custom_prefix'.format( + account=storage_account, fs=file_system + ) + ) + + finally: + intermediate_store.rm_object(context, ['true']) + + +@nettest +def test_adls2_intermediate_store(storage_account, file_system): + run_id = make_new_run_id() + run_id_2 = make_new_run_id() + + intermediate_store = ADLS2IntermediateStore( + client=get_client(storage_account), run_id=run_id, file_system=file_system, + ) + assert intermediate_store.root == '/'.join(['dagster', 'storage', run_id]) + + intermediate_store_2 = ADLS2IntermediateStore( + client=get_client(storage_account), run_id=run_id_2, file_system=file_system, + ) + assert intermediate_store_2.root == '/'.join(['dagster', 'storage', run_id_2]) + + try: + with yield_empty_pipeline_context(run_id=run_id) as context: + + intermediate_store.set_object(True, context, RuntimeBool, ['true']) + + assert intermediate_store.has_object(context, ['true']) + assert intermediate_store.get_object(context, RuntimeBool, ['true']).obj is True + assert intermediate_store.uri_for_paths(['true']).startswith('abfss://') + + intermediate_store_2.copy_object_from_run(context, run_id, ['true']) + assert intermediate_store_2.has_object(context, ['true']) + assert intermediate_store_2.get_object(context, RuntimeBool, ['true']).obj is True + finally: + intermediate_store.rm_object(context, ['true']) + intermediate_store_2.rm_object(context, ['true']) + + +class CsvSerializationStrategy(SerializationStrategy): + def __init__(self): + super(CsvSerializationStrategy, self).__init__( + "csv_strategy", read_mode="r", write_mode="w" + ) + + def serialize(self, value, write_file_obj): + fieldnames = value[0] + writer = csv.DictWriter(write_file_obj, fieldnames) + writer.writeheader() + writer.writerows(value) + + def deserialize(self, read_file_obj): + reader = csv.DictReader(read_file_obj) + return LessSimpleDataFrame([row for row in reader]) + + +@usable_as_dagster_type( + name="LessSimpleDataFrame", + description=("A naive representation of a data frame, e.g., as returned by " "csv.DictReader."), + serialization_strategy=CsvSerializationStrategy(), +) +class LessSimpleDataFrame(list): + pass + + +def test_custom_read_write_mode(storage_account, file_system): + run_id = make_new_run_id() + data_frame = [OrderedDict({'foo': '1', 'bar': '1'}), OrderedDict({'foo': '2', 'bar': '2'})] + try: + with yield_empty_pipeline_context(run_id=run_id) as context: + intermediate_store = ADLS2IntermediateStore( + client=get_client(storage_account), run_id=run_id, file_system=file_system, + ) + intermediate_store.set_object( + data_frame, context, resolve_dagster_type(LessSimpleDataFrame), ['data_frame'] + ) + + assert 
intermediate_store.has_object(context, ['data_frame']) + assert ( + intermediate_store.get_object( + context, resolve_dagster_type(LessSimpleDataFrame), ['data_frame'] + ).obj + == data_frame + ) + assert intermediate_store.uri_for_paths(['data_frame']).startswith('abfss://') + + finally: + intermediate_store.rm_object(context, ['data_frame']) diff --git a/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_object_store.py b/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_object_store.py new file mode 100644 index 0000000000000..b8b8713f504df --- /dev/null +++ b/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_object_store.py @@ -0,0 +1,37 @@ +from dagster_azure.adls2 import ADLS2ObjectStore, ADLS2FakeClient + +from dagster.core.storage.object_store import DEFAULT_SERIALIZATION_STRATEGY + + +def test_adls2_object_store( + storage_account, credential, file_system, caplog +): # pylint: disable=too-many-function-args + adls2_fake_client = ADLS2FakeClient(storage_account, credential) + + key = 'foo' + # Uses mock ADLS2 client + adls2_obj_store = ADLS2ObjectStore(file_system, client=adls2_fake_client) + res = adls2_obj_store.set_object(key, True, DEFAULT_SERIALIZATION_STRATEGY) + assert res.key == 'abfss://{fs}@{account}.dfs.core.windows.net/{key}'.format( + fs=file_system, account=storage_account, key=key + ) + + adls2_obj_store.set_object(key, True, DEFAULT_SERIALIZATION_STRATEGY) + assert 'Removing existing ADLS2 key' in caplog.text + + assert adls2_obj_store.has_object(key) + assert adls2_obj_store.get_object(key, DEFAULT_SERIALIZATION_STRATEGY).obj is True + + # Harder to test this since it requires a fake synchronised Blob client, + # since cp_object uses blob APIs to communicate... 
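+    # (The copy is routed through the Blob API -- compare start_copy_from_url in the
+    # Blob object store -- so a faithful test would need a fake Blob container whose
+    # state is shared with the fake ADLS2 filesystem. The commented calls below sketch
+    # what such a test would assert.)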
+ # adls2_obj_store.cp_object(key, 'bar') + # assert adls2_obj_store.has_object('bar') + + adls2_obj_store.rm_object(key) + assert not adls2_obj_store.has_object(key) + + assert adls2_obj_store.uri_for_key( + key + ) == 'abfss://{fs}@{account}.dfs.core.windows.net/{key}'.format( + fs=file_system, account=storage_account, key=key + ) diff --git a/python_modules/libraries/dagster-azure/dagster_azure_tests/blob_tests/conftest.py b/python_modules/libraries/dagster-azure/dagster_azure_tests/blob_tests/conftest.py new file mode 100644 index 0000000000000..655e26bcd68aa --- /dev/null +++ b/python_modules/libraries/dagster-azure/dagster_azure_tests/blob_tests/conftest.py @@ -0,0 +1,16 @@ +import pytest + + +@pytest.fixture(scope='session') +def storage_account(): + yield 'dagsterdatabrickstests' + + +@pytest.fixture(scope='session') +def container(): + yield 'dagster-databricks-tests' + + +@pytest.fixture(scope='session') +def credential(): + yield 'super-secret-creds' diff --git a/python_modules/libraries/dagster-azure/dagster_azure_tests/blob_tests/test_compute_log_manager.py b/python_modules/libraries/dagster-azure/dagster_azure_tests/blob_tests/test_compute_log_manager.py new file mode 100644 index 0000000000000..14e443d93dc16 --- /dev/null +++ b/python_modules/libraries/dagster-azure/dagster_azure_tests/blob_tests/test_compute_log_manager.py @@ -0,0 +1,127 @@ +import os +import sys + +import six +from dagster_azure.blob import AzureBlobComputeLogManager, create_blob_fake_resource + +from dagster import DagsterEventType, execute_pipeline, pipeline, seven, solid +from dagster.core.instance import DagsterInstance, InstanceType +from dagster.core.storage.compute_log_manager import ComputeIOType +from dagster.core.storage.event_log import SqliteEventLogStorage +from dagster.core.storage.root import LocalArtifactStorage +from dagster.core.storage.runs import SqliteRunStorage +from dagster.seven import mock + +HELLO_WORLD = 'Hello World' +SEPARATOR = os.linesep if (os.name == 'nt' and sys.version_info < (3,)) else '\n' +EXPECTED_LOGS = [ + 'STEP_START - Started execution of step "easy.compute".', + 'STEP_OUTPUT - Yielded output "result" of type "Any". 
(Type check passed).', + 'STEP_SUCCESS - Finished execution of step "easy.compute"', +] + + +@mock.patch('dagster_azure.blob.compute_log_manager.generate_blob_sas') +@mock.patch('dagster_azure.blob.compute_log_manager.create_blob_client') +def test_compute_log_manager( + mock_create_blob_client, mock_generate_blob_sas, storage_account, container, credential +): + mock_generate_blob_sas.return_value = 'fake-url' + fake_client = create_blob_fake_resource(storage_account) + mock_create_blob_client.return_value = fake_client + + @pipeline + def simple(): + @solid + def easy(context): + context.log.info('easy') + print(HELLO_WORLD) + return 'easy' + + easy() + + with seven.TemporaryDirectory() as temp_dir: + run_store = SqliteRunStorage.from_local(temp_dir) + event_store = SqliteEventLogStorage(temp_dir) + manager = AzureBlobComputeLogManager( + storage_account=storage_account, + container=container, + prefix='my_prefix', + local_dir=temp_dir, + secret_key=credential, + ) + instance = DagsterInstance( + instance_type=InstanceType.PERSISTENT, + local_artifact_storage=LocalArtifactStorage(temp_dir), + run_storage=run_store, + event_storage=event_store, + compute_log_manager=manager, + ) + result = execute_pipeline(simple, instance=instance) + compute_steps = [ + event.step_key + for event in result.step_event_list + if event.event_type == DagsterEventType.STEP_START + ] + assert len(compute_steps) == 1 + step_key = compute_steps[0] + + stdout = manager.read_logs_file(result.run_id, step_key, ComputeIOType.STDOUT) + assert stdout.data == HELLO_WORLD + SEPARATOR + + stderr = manager.read_logs_file(result.run_id, step_key, ComputeIOType.STDERR) + for expected in EXPECTED_LOGS: + assert expected in stderr.data + + # Check ADLS2 directly + adls2_object = fake_client.get_blob_client( + container=container, + blob='{prefix}/storage/{run_id}/compute_logs/easy.compute.err'.format( + prefix='my_prefix', run_id=result.run_id + ), + ) + adls2_stderr = six.ensure_str(adls2_object.download_blob().readall()) + for expected in EXPECTED_LOGS: + assert expected in adls2_stderr + + # Check download behavior by deleting locally cached logs + compute_logs_dir = os.path.join(temp_dir, result.run_id, 'compute_logs') + for filename in os.listdir(compute_logs_dir): + os.unlink(os.path.join(compute_logs_dir, filename)) + + stdout = manager.read_logs_file(result.run_id, step_key, ComputeIOType.STDOUT) + assert stdout.data == HELLO_WORLD + SEPARATOR + + stderr = manager.read_logs_file(result.run_id, step_key, ComputeIOType.STDERR) + for expected in EXPECTED_LOGS: + assert expected in stderr.data + + +def test_compute_log_manager_from_config(storage_account, container, credential): + prefix = 'foobar' + + dagster_yaml = ''' +compute_logs: + module: dagster_azure.blob.compute_log_manager + class: AzureBlobComputeLogManager + config: + storage_account: "{storage_account}" + container: {container} + secret_key: {credential} + local_dir: "/tmp/cool" + prefix: "{prefix}" +'''.format( + storage_account=storage_account, container=container, credential=credential, prefix=prefix + ) + + with seven.TemporaryDirectory() as tempdir: + with open(os.path.join(tempdir, 'dagster.yaml'), 'wb') as f: + f.write(six.ensure_binary(dagster_yaml)) + + instance = DagsterInstance.from_config(tempdir) + assert ( + instance.compute_log_manager._storage_account # pylint: disable=protected-access + == storage_account + ) + assert instance.compute_log_manager._container == container # pylint: disable=protected-access + assert 
instance.compute_log_manager._blob_prefix == prefix # pylint: disable=protected-access diff --git a/python_modules/libraries/dagster-azure/dagster_azure_tests/test_version.py b/python_modules/libraries/dagster-azure/dagster_azure_tests/test_version.py new file mode 100644 index 0000000000000..355494028ba40 --- /dev/null +++ b/python_modules/libraries/dagster-azure/dagster_azure_tests/test_version.py @@ -0,0 +1,5 @@ +from dagster_azure.version import __version__ + + +def test_version(): + assert __version__ diff --git a/python_modules/libraries/dagster-azure/dev-requirements.txt b/python_modules/libraries/dagster-azure/dev-requirements.txt new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/python_modules/libraries/dagster-azure/setup.cfg b/python_modules/libraries/dagster-azure/setup.cfg new file mode 100644 index 0000000000000..8183238ab1c7f --- /dev/null +++ b/python_modules/libraries/dagster-azure/setup.cfg @@ -0,0 +1,2 @@ +[metadata] +license_files = LICENSE diff --git a/python_modules/libraries/dagster-azure/setup.py b/python_modules/libraries/dagster-azure/setup.py new file mode 100644 index 0000000000000..040c2a539e816 --- /dev/null +++ b/python_modules/libraries/dagster-azure/setup.py @@ -0,0 +1,53 @@ +import argparse +import sys + +from setuptools import find_packages, setup + + +def get_version(name): + version = {} + with open('dagster_azure/version.py') as fp: + exec(fp.read(), version) # pylint: disable=W0122 + + if name == 'dagster-azure': + return version['__version__'] + elif name == 'dagster-azure-nightly': + return version['__nightly__'] + else: + raise Exception('Shouldn\'t be here: bad package name {name}'.format(name=name)) + + +parser = argparse.ArgumentParser() +parser.add_argument('--nightly', action='store_true') + + +def _do_setup(name='dagster-azure'): + setup( + name=name, + version=get_version(name), + author='Elementl', + license='Apache-2.0', + description='Package for Azure-specific Dagster framework solid and resource components.', + url='https://github.com/dagster-io/dagster/tree/master/python_modules/libraries/dagster-azure', + classifiers=[ + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', + 'License :: OSI Approved :: Apache Software License', + 'Operating System :: OS Independent', + ], + packages=find_packages(exclude=['test']), + include_package_data=True, + install_requires=['azure-storage-file-datalake~=12.0.1', 'dagster'], + entry_points={'console_scripts': ['dagster-azure = dagster_azure.cli.cli:main']}, + zip_safe=False, + ) + + +if __name__ == '__main__': + parsed, unparsed = parser.parse_known_args() + sys.argv = [sys.argv[0]] + unparsed + if parsed.nightly: + _do_setup('dagster-azure-nightly') + else: + _do_setup('dagster-azure') diff --git a/python_modules/libraries/dagster-azure/tox.ini b/python_modules/libraries/dagster-azure/tox.ini new file mode 100644 index 0000000000000..da4d1347591c1 --- /dev/null +++ b/python_modules/libraries/dagster-azure/tox.ini @@ -0,0 +1,24 @@ +[tox] +envlist = py{37,36,35,27}-{unix,windows} + +[testenv] +passenv = CI_* COVERALLS_REPO_TOKEN AZURE_* BUILDKITE SSH_* +deps = + -e ../../dagster + -r ../../dagster/dev-requirements.txt + -e ../dagster-spark + -e ../dagster-pyspark + -r ./dev-requirements.txt + -e . +usedevelop = true +whitelist_externals = + /bin/bash + echo +commands = + !windows: /bin/bash -c '! 
pip list --exclude-editable | grep -e dagster -e dagit' + coverage erase + echo -e "--- \033[0;32m:pytest: Running tox tests\033[0m" + pytest -vv --junitxml=test_results.xml --cov=dagster_azure --cov-append --cov-report= + coverage report --omit='.tox/*,**/test_*.py' --skip-covered + coverage html --omit='.tox/*,**/test_*.py' + coverage xml --omit='.tox/*,**/test_*.py' From 21cbe6e76a184006a4cd09bb6802e18dca9e5f05 Mon Sep 17 00:00:00 2001 From: Ben Sully Date: Thu, 21 May 2020 21:52:39 +0100 Subject: [PATCH 02/21] Rename Fake Azure classes and modules to more English-friendly names --- .../dagster_azure/adls2/__init__.py | 2 +- .../adls2/adls2_fake_resource.py | 143 --------------- .../dagster_azure/blob/__init__.py | 2 +- .../dagster_azure/blob/blob_fake_resource.py | 164 ------------------ .../adls2_tests/test_adls2_file_cache.py | 10 +- .../adls2_tests/test_adls2_file_manager.py | 12 +- .../adls2_tests/test_object_store.py | 4 +- .../blob_tests/test_compute_log_manager.py | 4 +- 8 files changed, 17 insertions(+), 324 deletions(-) delete mode 100644 python_modules/libraries/dagster-azure/dagster_azure/adls2/adls2_fake_resource.py delete mode 100644 python_modules/libraries/dagster-azure/dagster_azure/blob/blob_fake_resource.py diff --git a/python_modules/libraries/dagster-azure/dagster_azure/adls2/__init__.py b/python_modules/libraries/dagster-azure/dagster_azure/adls2/__init__.py index c2246756a6250..dd43d9597b337 100644 --- a/python_modules/libraries/dagster-azure/dagster_azure/adls2/__init__.py +++ b/python_modules/libraries/dagster-azure/dagster_azure/adls2/__init__.py @@ -1,4 +1,4 @@ -from .adls2_fake_resource import ADLS2FakeClient, create_adls2_fake_resource +from .fake_adls2_resource import FakeADLS2ServiceClient from .file_cache import ADLS2FileCache, adls2_file_cache from .file_manager import ADLS2FileHandle, ADLS2FileManager from .intermediate_store import ADLS2IntermediateStore diff --git a/python_modules/libraries/dagster-azure/dagster_azure/adls2/adls2_fake_resource.py b/python_modules/libraries/dagster-azure/dagster_azure/adls2/adls2_fake_resource.py deleted file mode 100644 index 25d187d37eb4d..0000000000000 --- a/python_modules/libraries/dagster-azure/dagster_azure/adls2/adls2_fake_resource.py +++ /dev/null @@ -1,143 +0,0 @@ -from collections import defaultdict -from contextlib import contextmanager -import io -import random - -from azure.core.exceptions import ResourceNotFoundError - -from dagster.seven import mock - - -def create_adls2_fake_resource(account_name): - '''Create a mock ADLS2 client for test.''' - return ADLS2FakeClient(account_name, 'fake-creds') - - -class ADLS2FakeClient(object): - '''Stateful mock of an ADLS2 client for testing. - - Wraps a ``mock.MagicMock``. Containers are implemented using an in-memory dict. 
- ''' - - def __init__(self, account_name, credential): - - self._account_name = account_name - self._credential = mock.MagicMock() - self._credential.account_key = credential - self._file_systems = {} - - @property - def account_name(self): - return self._account_name - - @property - def credential(self): - return self._credential - - @property - def file_systems(self): - return self._file_systems - - def get_file_system_client(self, file_system): - return self._file_systems.setdefault( - file_system, ADLS2FakeFilesystemClient(self.account_name, file_system) - ) - - def get_file_client(self, file_system, file_path): - return self.get_file_system_client(file_system).get_file_client(file_path) - - -class ADLS2FakeFilesystemClient(object): - '''Stateful mock of an ADLS2 filesystem client for testing.''' - - def __init__(self, account_name, file_system_name): - self._file_system = defaultdict(ADLS2FakeFileClient) - self._account_name = account_name - self._file_system_name = file_system_name - - @property - def account_name(self): - return self._account_name - - @property - def file_system_name(self): - return self._file_system_name - - def keys(self): - return self._file_system.keys() - - def get_file_system_properties(self): - return {"account_name": self.account_name, "file_system_name": self.file_system_name} - - def has_file(self, path): - return bool(self._file_system.get(path)) - - def get_file_client(self, file_path): - return self._file_system[file_path] - - def create_file(self, file): - return self._file_system[file] - - def delete_file(self, file): - for k in list(self._file_system.keys()): - if k.startswith(file): - del self._file_system[k] - - -class ADLS2FakeFileClient(object): - '''Stateful mock of an ADLS2 file client for testing.''' - - def __init__(self): - self.contents = None - self.lease = None - - def get_file_properties(self): - if self.contents is None: - raise ResourceNotFoundError("File does not exist!") - return {"lease": self.lease} - - def upload_data(self, contents, overwrite=False, lease=None): - if self.lease is not None: - if lease != self.lease: - raise Exception("Invalid lease!") - if self.contents is not None or overwrite is True: - if isinstance(contents, str): - self.contents = contents.encode('utf8') - elif isinstance(contents, io.BytesIO): - self.contents = contents.read() - elif isinstance(contents, io.StringIO): - self.contents = contents.read().encode('utf8') - elif isinstance(contents, bytes): - self.contents = contents - else: - print("Uploading unknown data") - self.contents = contents - - @contextmanager - def acquire_lease(self, lease_duration=-1): # pylint: disable=unused-argument - if self.lease is None: - self.lease = random.randint(0, 2 ** 9) - try: - yield self.lease - finally: - self.lease = None - else: - raise Exception("Lease already held") - - def download_file(self): - if self.contents is None: - raise ResourceNotFoundError("File does not exist!") - return ADLS2FakeFileDownloader(contents=self.contents) - - -class ADLS2FakeFileDownloader(object): - '''Mock of an ADLS2 file downloader for testing.''' - - def __init__(self, contents): - self.contents = contents - - def readall(self): - return self.contents - - def readinto(self, fileobj): - fileobj.write(self.contents) diff --git a/python_modules/libraries/dagster-azure/dagster_azure/blob/__init__.py b/python_modules/libraries/dagster-azure/dagster_azure/blob/__init__.py index 9a1d43bfbf9fe..5add15da4b006 100644 --- 
a/python_modules/libraries/dagster-azure/dagster_azure/blob/__init__.py +++ b/python_modules/libraries/dagster-azure/dagster_azure/blob/__init__.py @@ -1,2 +1,2 @@ -from .blob_fake_resource import create_blob_fake_resource +from .fake_blob_client import FakeBlobServiceClient from .compute_log_manager import AzureBlobComputeLogManager diff --git a/python_modules/libraries/dagster-azure/dagster_azure/blob/blob_fake_resource.py b/python_modules/libraries/dagster-azure/dagster_azure/blob/blob_fake_resource.py deleted file mode 100644 index 2634b56913533..0000000000000 --- a/python_modules/libraries/dagster-azure/dagster_azure/blob/blob_fake_resource.py +++ /dev/null @@ -1,164 +0,0 @@ -from collections import defaultdict -from contextlib import contextmanager -import io -import random - -from azure.core.exceptions import ResourceNotFoundError - -from dagster.seven import mock - - -def create_blob_fake_resource(account_name): - '''Create a mock Blob client for test.''' - return BlobFakeClient(account_name, 'fake-creds') - - -class BlobFakeClient(object): - '''Stateful mock of an Blob client for testing. - - Wraps a ``mock.MagicMock``. Containers are implemented using an in-memory dict. - ''' - - def __init__(self, account_name, credential): - - self._account_name = account_name - self._credential = mock.MagicMock() - self._credential.account_key = credential - self._containers = {} - - @property - def account_name(self): - return self._account_name - - @property - def credential(self): - return self._credential - - @property - def containers(self): - return self._containers - - def get_container_client(self, container): - return self._containers.setdefault( - container, BlobFakeContainerClient(self.account_name, container) - ) - - def get_blob_client(self, container, blob): - return self.get_container_client(container).get_blob_client(blob) - - -class BlobFakeContainerClient(object): - '''Stateful mock of an Blob container client for testing.''' - - def __init__(self, account_name, container_name): - self._container = defaultdict(BlobFakeBlobClient) - self._account_name = account_name - self._container_name = container_name - - @property - def account_name(self): - return self._account_name - - @property - def container_name(self): - return self._container_name - - def keys(self): - return self._container.keys() - - def get_container_properties(self): - return {"account_name": self.account_name, "container_name": self.container_name} - - def has_blob(self, path): - return bool(self._container.get(path)) - - def get_blob_client(self, blob): - return self._container[blob] - - def create_blob(self, blob): - return self._container[blob] - - def list_blobs(self, name_starts_with=None): - for k, v in self._container.items(): - if name_starts_with is None or k.startswith(name_starts_with): - yield { - 'name': k, - # This clearly isn't actually the URL but we need a way of copying contents - # across blobs and this allows us to do it - 'url': v.contents, - } - - def delete_blob(self, blob): - # Use list to avoid mutating dict as we iterate - for k in list(self._container.keys()): - if k.startswith(blob): - del self._container[k] - - -class BlobFakeBlobClient(object): - '''Stateful mock of an Blob blob client for testing.''' - - def __init__(self): - self.contents = None - self.lease = None - - def start_copy_from_url(self, url): - self.contents = url - - def get_blob_properties(self): - if self.contents is None: - raise ResourceNotFoundError("File does not exist!") - return {"lease": self.lease} - - def 
upload_blob(self, contents, overwrite=False, lease=None): - if self.lease is not None: - if lease != self.lease: - raise Exception("Invalid lease!") - if self.contents is None or overwrite is True: - if isinstance(contents, str): - self.contents = contents.encode('utf8') - elif isinstance(contents, io.TextIOBase): - self.contents = contents.read().encode('utf8') - elif isinstance(contents, io.IOBase): - self.contents = contents.read() - elif isinstance(contents, bytes): - self.contents = contents - # Python 2 compatibility - no base class for `file` type - elif hasattr(contents, 'read'): - self.contents = contents.read() - else: - print("Uploading unknown data") - self.contents = contents - - @property - def url(self): - return ':memory:' - - @contextmanager - def acquire_lease(self, lease_duration=-1): # pylint: disable=unused-argument - if self.lease is None: - self.lease = random.randint(0, 2 ** 9) - try: - yield self.lease - finally: - self.lease = None - else: - raise Exception("Lease already held") - - def download_blob(self): - if self.contents is None: - raise ResourceNotFoundError("File does not exist!") - return BlobFakeBlobDownloader(contents=self.contents) - - -class BlobFakeBlobDownloader(object): - '''Mock of a Blob file downloader for testing.''' - - def __init__(self, contents): - self.contents = contents - - def readall(self): - return self.contents - - def readinto(self, fileobj): - fileobj.write(self.contents) diff --git a/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_adls2_file_cache.py b/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_adls2_file_cache.py index 1e5ef13d0c313..eae06d53ab9c1 100644 --- a/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_adls2_file_cache.py +++ b/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_adls2_file_cache.py @@ -1,10 +1,10 @@ import io -from dagster_azure.adls2 import ADLS2FakeClient, ADLS2FileCache, ADLS2FileHandle +from dagster_azure.adls2 import FakeADLS2ServiceClient, ADLS2FileCache, ADLS2FileHandle def test_adls2_file_cache_file_not_present(storage_account, file_system, credential): - fake_client = ADLS2FakeClient(storage_account, credential) + fake_client = FakeADLS2ServiceClient(storage_account, credential) file_store = ADLS2FileCache( storage_account=storage_account, file_system=file_system, @@ -17,7 +17,7 @@ def test_adls2_file_cache_file_not_present(storage_account, file_system, credent def test_adls2_file_cache_file_present(storage_account, file_system, credential): - fake_client = ADLS2FakeClient(storage_account, credential) + fake_client = FakeADLS2ServiceClient(storage_account, credential) file_store = ADLS2FileCache( storage_account=storage_account, file_system=file_system, @@ -34,7 +34,7 @@ def test_adls2_file_cache_file_present(storage_account, file_system, credential) def test_adls2_file_cache_correct_handle(storage_account, file_system, credential): - fake_client = ADLS2FakeClient(storage_account, credential) + fake_client = FakeADLS2ServiceClient(storage_account, credential) file_store = ADLS2FileCache( storage_account=storage_account, file_system=file_system, @@ -47,7 +47,7 @@ def test_adls2_file_cache_correct_handle(storage_account, file_system, credentia def test_adls2_file_cache_write_file_object(storage_account, file_system, credential): - fake_client = ADLS2FakeClient(storage_account, credential) + fake_client = FakeADLS2ServiceClient(storage_account, credential) file_store = ADLS2FileCache( 
storage_account=storage_account, file_system=file_system, diff --git a/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_adls2_file_manager.py b/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_adls2_file_manager.py index 4882f36dc7489..b379a73f89eeb 100644 --- a/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_adls2_file_manager.py +++ b/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_adls2_file_manager.py @@ -3,10 +3,10 @@ from dagster_azure.adls2 import ( ADLS2FileHandle, ADLS2FileManager, - create_adls2_fake_resource, + FakeADLS2ServiceClient, adls2_plus_default_storage_defs, ) -from dagster_azure.blob import create_blob_fake_resource +from dagster_azure.blob import FakeBlobServiceClient from dagster import ( InputDefinition, @@ -102,8 +102,8 @@ def test_depends_on_adls2_resource_intermediates( def add_numbers(_, num_one, num_two): return num_one + num_two - mock_create_blob_client.return_value = create_blob_fake_resource(storage_account) - adls2_fake_resource = create_adls2_fake_resource(storage_account) + mock_create_blob_client.return_value = FakeBlobServiceClient(storage_account) + adls2_fake_resource = FakeADLS2ServiceClient(storage_account) @pipeline( mode_defs=[ @@ -160,8 +160,8 @@ def accept_file(context, file_handle): assert isinstance(local_path, str) assert open(local_path, 'rb').read() == bar_bytes - mock_create_blob_client.return_value = create_blob_fake_resource(storage_account) - adls2_fake_resource = create_adls2_fake_resource(storage_account) + mock_create_blob_client.return_value = FakeBlobServiceClient(storage_account) + adls2_fake_resource = FakeADLS2ServiceClient(storage_account) @pipeline( mode_defs=[ diff --git a/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_object_store.py b/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_object_store.py index b8b8713f504df..e111bedadeb5d 100644 --- a/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_object_store.py +++ b/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_object_store.py @@ -1,4 +1,4 @@ -from dagster_azure.adls2 import ADLS2ObjectStore, ADLS2FakeClient +from dagster_azure.adls2 import ADLS2ObjectStore, FakeADLS2ServiceClient from dagster.core.storage.object_store import DEFAULT_SERIALIZATION_STRATEGY @@ -6,7 +6,7 @@ def test_adls2_object_store( storage_account, credential, file_system, caplog ): # pylint: disable=too-many-function-args - adls2_fake_client = ADLS2FakeClient(storage_account, credential) + adls2_fake_client = FakeADLS2ServiceClient(storage_account, credential) key = 'foo' # Uses mock ADLS2 client diff --git a/python_modules/libraries/dagster-azure/dagster_azure_tests/blob_tests/test_compute_log_manager.py b/python_modules/libraries/dagster-azure/dagster_azure_tests/blob_tests/test_compute_log_manager.py index 14e443d93dc16..deb9576a512b0 100644 --- a/python_modules/libraries/dagster-azure/dagster_azure_tests/blob_tests/test_compute_log_manager.py +++ b/python_modules/libraries/dagster-azure/dagster_azure_tests/blob_tests/test_compute_log_manager.py @@ -2,7 +2,7 @@ import sys import six -from dagster_azure.blob import AzureBlobComputeLogManager, create_blob_fake_resource +from dagster_azure.blob import AzureBlobComputeLogManager, FakeBlobServiceClient from dagster import DagsterEventType, execute_pipeline, pipeline, seven, solid from dagster.core.instance import DagsterInstance, 
InstanceType @@ -27,7 +27,7 @@ def test_compute_log_manager( mock_create_blob_client, mock_generate_blob_sas, storage_account, container, credential ): mock_generate_blob_sas.return_value = 'fake-url' - fake_client = create_blob_fake_resource(storage_account) + fake_client = FakeBlobServiceClient(storage_account) mock_create_blob_client.return_value = fake_client @pipeline From 7b3fa44f7b065220e8245ca605f471ccbe4606cd Mon Sep 17 00:00:00 2001 From: Ben Sully Date: Thu, 21 May 2020 22:35:08 +0100 Subject: [PATCH 03/21] Add ADLS2Resource to wrap ADLS2/Blob clients --- .../dagster_azure/adls2/__init__.py | 3 +- .../adls2/fake_adls2_resource.py | 154 +++++++++++++++++ .../dagster_azure/adls2/intermediate_store.py | 10 +- .../dagster_azure/adls2/object_store.py | 17 +- .../dagster_azure/adls2/resources.py | 23 ++- .../dagster_azure/adls2/system_storage.py | 9 +- .../dagster_azure/blob/__init__.py | 3 +- .../dagster_azure/blob/fake_blob_client.py | 159 ++++++++++++++++++ .../adls2_tests/test_adls2_file_manager.py | 25 +-- .../adls2_tests/test_intermediate_store.py | 50 ++++-- .../adls2_tests/test_object_store.py | 10 +- 11 files changed, 410 insertions(+), 53 deletions(-) create mode 100644 python_modules/libraries/dagster-azure/dagster_azure/adls2/fake_adls2_resource.py create mode 100644 python_modules/libraries/dagster-azure/dagster_azure/blob/fake_blob_client.py diff --git a/python_modules/libraries/dagster-azure/dagster_azure/adls2/__init__.py b/python_modules/libraries/dagster-azure/dagster_azure/adls2/__init__.py index dd43d9597b337..f6e81358fcb40 100644 --- a/python_modules/libraries/dagster-azure/dagster_azure/adls2/__init__.py +++ b/python_modules/libraries/dagster-azure/dagster_azure/adls2/__init__.py @@ -1,4 +1,4 @@ -from .fake_adls2_resource import FakeADLS2ServiceClient +from .fake_adls2_resource import FakeADLS2Resource, FakeADLS2ServiceClient from .file_cache import ADLS2FileCache, adls2_file_cache from .file_manager import ADLS2FileHandle, ADLS2FileManager from .intermediate_store import ADLS2IntermediateStore @@ -7,5 +7,4 @@ from .system_storage import adls2_plus_default_storage_defs, adls2_system_storage from .utils import create_adls2_client -# from .compute_log_manager import ADLS2ComputeLogManager # from .solids import ADLS2Coordinate, file_handle_to_adls2 diff --git a/python_modules/libraries/dagster-azure/dagster_azure/adls2/fake_adls2_resource.py b/python_modules/libraries/dagster-azure/dagster_azure/adls2/fake_adls2_resource.py new file mode 100644 index 0000000000000..1e27f388f96db --- /dev/null +++ b/python_modules/libraries/dagster-azure/dagster_azure/adls2/fake_adls2_resource.py @@ -0,0 +1,154 @@ +from collections import defaultdict +from contextlib import contextmanager +import io +import random + +from azure.core.exceptions import ResourceNotFoundError + +from dagster.seven import mock + +from dagster_azure.blob import FakeBlobServiceClient +from .resources import ADLS2Resource + + +class FakeADLS2Resource(ADLS2Resource): + '''Stateful mock of an ADLS2Resource for testing. + + Wraps a ``mock.MagicMock``. Containers are implemented using an in-memory dict. + ''' + + def __init__( + self, account_name, credential='fake-creds' + ): # pylint: disable=unused-argument,super-init-not-called + self._adls2_client = FakeADLS2ServiceClient(account_name) + self._blob_client = FakeBlobServiceClient(account_name) + + +class FakeADLS2ServiceClient(object): + '''Stateful mock of an ADLS2 service client for testing. + + Wraps a ``mock.MagicMock``. 
Containers are implemented using an in-memory dict. + ''' + + def __init__(self, account_name, credential='fake-creds'): + + self._account_name = account_name + self._credential = mock.MagicMock() + self._credential.account_key = credential + self._file_systems = {} + + @property + def account_name(self): + return self._account_name + + @property + def credential(self): + return self._credential + + @property + def file_systems(self): + return self._file_systems + + def get_file_system_client(self, file_system): + return self._file_systems.setdefault( + file_system, FakeADLS2FilesystemClient(self.account_name, file_system) + ) + + def get_file_client(self, file_system, file_path): + return self.get_file_system_client(file_system).get_file_client(file_path) + + +class FakeADLS2FilesystemClient(object): + '''Stateful mock of an ADLS2 filesystem client for testing.''' + + def __init__(self, account_name, file_system_name): + self._file_system = defaultdict(FakeADLS2FileClient) + self._account_name = account_name + self._file_system_name = file_system_name + + @property + def account_name(self): + return self._account_name + + @property + def file_system_name(self): + return self._file_system_name + + def keys(self): + return self._file_system.keys() + + def get_file_system_properties(self): + return {"account_name": self.account_name, "file_system_name": self.file_system_name} + + def has_file(self, path): + return bool(self._file_system.get(path)) + + def get_file_client(self, file_path): + return self._file_system[file_path] + + def create_file(self, file): + return self._file_system[file] + + def delete_file(self, file): + for k in list(self._file_system.keys()): + if k.startswith(file): + del self._file_system[k] + + +class FakeADLS2FileClient(object): + '''Stateful mock of an ADLS2 file client for testing.''' + + def __init__(self): + self.contents = None + self.lease = None + + def get_file_properties(self): + if self.contents is None: + raise ResourceNotFoundError("File does not exist!") + return {"lease": self.lease} + + def upload_data(self, contents, overwrite=False, lease=None): + if self.lease is not None: + if lease != self.lease: + raise Exception("Invalid lease!") + if self.contents is not None or overwrite is True: + if isinstance(contents, str): + self.contents = contents.encode('utf8') + elif isinstance(contents, io.BytesIO): + self.contents = contents.read() + elif isinstance(contents, io.StringIO): + self.contents = contents.read().encode('utf8') + elif isinstance(contents, bytes): + self.contents = contents + else: + print("Uploading unknown data") + self.contents = contents + + @contextmanager + def acquire_lease(self, lease_duration=-1): # pylint: disable=unused-argument + if self.lease is None: + self.lease = random.randint(0, 2 ** 9) + try: + yield self.lease + finally: + self.lease = None + else: + raise Exception("Lease already held") + + def download_file(self): + if self.contents is None: + raise ResourceNotFoundError("File does not exist!") + return FakeADLS2FileDownloader(contents=self.contents) + + +class FakeADLS2FileDownloader(object): + '''Mock of an ADLS2 file downloader for testing.''' + + def __init__(self, contents): + self.contents = contents + + def readall(self): + return self.contents + + def readinto(self, fileobj): + fileobj.write(self.contents) diff --git a/python_modules/libraries/dagster-azure/dagster_azure/adls2/intermediate_store.py b/python_modules/libraries/dagster-azure/dagster_azure/adls2/intermediate_store.py index 
2c2fc5cb0610c..b4328b56a7a99 100644 --- a/python_modules/libraries/dagster-azure/dagster_azure/adls2/intermediate_store.py +++ b/python_modules/libraries/dagster-azure/dagster_azure/adls2/intermediate_store.py @@ -13,13 +13,19 @@ class ADLS2IntermediateStore(IntermediateStore): ''' def __init__( - self, file_system, run_id, client, type_storage_plugin_registry=None, prefix='dagster', + self, + file_system, + run_id, + adls2_client, + blob_client, + type_storage_plugin_registry=None, + prefix='dagster', ): check.str_param(file_system, 'file_system') check.str_param(prefix, 'prefix') check.str_param(run_id, 'run_id') - object_store = ADLS2ObjectStore(file_system, client) + object_store = ADLS2ObjectStore(file_system, adls2_client, blob_client) def root_for_run_id(r_id): return object_store.key_for_paths([prefix, 'storage', r_id]) diff --git a/python_modules/libraries/dagster-azure/dagster_azure/adls2/object_store.py b/python_modules/libraries/dagster-azure/dagster_azure/adls2/object_store.py index 4856513e2eb2c..f05b087259327 100644 --- a/python_modules/libraries/dagster-azure/dagster_azure/adls2/object_store.py +++ b/python_modules/libraries/dagster-azure/dagster_azure/adls2/object_store.py @@ -9,25 +9,18 @@ from dagster.core.definitions.events import ObjectStoreOperation, ObjectStoreOperationType from dagster.core.storage.object_store import ObjectStore from dagster.core.types.marshal import SerializationStrategy -from dagster.seven import urlparse - -from dagster_azure.blob.utils import create_blob_client DEFAULT_LEASE_DURATION = 60 # One minute class ADLS2ObjectStore(ObjectStore): - def __init__(self, file_system, client, lease_duration=DEFAULT_LEASE_DURATION): - self.adls2_client = client + def __init__( + self, file_system, adls2_client, blob_client, lease_duration=DEFAULT_LEASE_DURATION + ): + self.adls2_client = adls2_client self.file_system_client = self.adls2_client.get_file_system_client(file_system) # We also need a blob client to handle copying as ADLS doesn't have a copy API yet - self.blob_client = create_blob_client( - client.account_name, - # client.credential is non-null if a secret key was used to authenticate - client.credential.account_key if client.credential is not None - # otherwise the SAS token will be in the query string of the URL - else urlparse(client.url).query, - ) + self.blob_client = blob_client self.blob_container_client = self.blob_client.get_container_client(file_system) self.lease_duration = lease_duration diff --git a/python_modules/libraries/dagster-azure/dagster_azure/adls2/resources.py b/python_modules/libraries/dagster-azure/dagster_azure/adls2/resources.py index 7a51a4b4adb6d..0584244e038ef 100644 --- a/python_modules/libraries/dagster-azure/dagster_azure/adls2/resources.py +++ b/python_modules/libraries/dagster-azure/dagster_azure/adls2/resources.py @@ -1,5 +1,6 @@ from dagster import Field, Selector, String, StringSource, resource +from dagster_azure.blob.utils import create_blob_client from .utils import create_adls2_client @@ -73,4 +74,24 @@ def example_adls2_solid(context): ''' storage_account = context.resource_config['storage_account'] credential = context.resource_config["credential"].copy().popitem()[1] - return create_adls2_client(storage_account, credential) + return ADLS2Resource(storage_account, credential) + + +class ADLS2Resource(object): + '''Resource containing clients to access Azure Data Lake Storage Gen2. + + Contains a client for both the Data Lake and Blob APIs, to work around the limitations + of each. 
+ ''' + + def __init__(self, storage_account, credential): + self._adls2_client = create_adls2_client(storage_account, credential) + self._blob_client = create_blob_client(storage_account, credential) + + @property + def adls2_client(self): + return self._adls2_client + + @property + def blob_client(self): + return self._blob_client diff --git a/python_modules/libraries/dagster-azure/dagster_azure/adls2/system_storage.py b/python_modules/libraries/dagster-azure/dagster_azure/adls2/system_storage.py index edde88d383c7f..35fa7c996d20d 100644 --- a/python_modules/libraries/dagster-azure/dagster_azure/adls2/system_storage.py +++ b/python_modules/libraries/dagster-azure/dagster_azure/adls2/system_storage.py @@ -49,23 +49,24 @@ def adls2_system_storage(init_context): adls2_file_system: my-cool-file-system adls2_prefix: good/prefix-for-files- ''' - client = init_context.resources.adls2 + resource = init_context.resources.adls2 adls2_base = '{prefix}/storage/{run_id}/files'.format( prefix=init_context.system_storage_config['adls2_prefix'], run_id=init_context.pipeline_run.run_id, ) return SystemStorageData( file_manager=ADLS2FileManager( - adls2_client=client, + adls2_client=resource.adls2_client, file_system=init_context.system_storage_config['adls2_file_system'], prefix=adls2_base, ), intermediates_manager=IntermediateStoreIntermediatesManager( ADLS2IntermediateStore( - client=client, file_system=init_context.system_storage_config['adls2_file_system'], - prefix=init_context.system_storage_config['adls2_prefix'], run_id=init_context.pipeline_run.run_id, + adls2_client=resource.adls2_client, + blob_client=resource.blob_client, + prefix=init_context.system_storage_config['adls2_prefix'], type_storage_plugin_registry=init_context.type_storage_plugin_registry, ) ), diff --git a/python_modules/libraries/dagster-azure/dagster_azure/blob/__init__.py b/python_modules/libraries/dagster-azure/dagster_azure/blob/__init__.py index 5add15da4b006..45bcca377bbe0 100644 --- a/python_modules/libraries/dagster-azure/dagster_azure/blob/__init__.py +++ b/python_modules/libraries/dagster-azure/dagster_azure/blob/__init__.py @@ -1,2 +1,3 @@ -from .fake_blob_client import FakeBlobServiceClient from .compute_log_manager import AzureBlobComputeLogManager +from .fake_blob_client import FakeBlobServiceClient +from .utils import create_blob_client diff --git a/python_modules/libraries/dagster-azure/dagster_azure/blob/fake_blob_client.py b/python_modules/libraries/dagster-azure/dagster_azure/blob/fake_blob_client.py new file mode 100644 index 0000000000000..31dcea2036b70 --- /dev/null +++ b/python_modules/libraries/dagster-azure/dagster_azure/blob/fake_blob_client.py @@ -0,0 +1,159 @@ +from collections import defaultdict +from contextlib import contextmanager +import io +import random + +from azure.core.exceptions import ResourceNotFoundError + +from dagster.seven import mock + + +class FakeBlobServiceClient(object): + '''Stateful mock of a Blob service client for testing. + + Wraps a ``mock.MagicMock``. Containers are implemented using an in-memory dict.
+ ''' + + def __init__(self, account_name, credential='fake-creds'): + + self._account_name = account_name + self._credential = mock.MagicMock() + self._credential.account_key = credential + self._containers = {} + + @property + def account_name(self): + return self._account_name + + @property + def credential(self): + return self._credential + + @property + def containers(self): + return self._containers + + def get_container_client(self, container): + return self._containers.setdefault( + container, FakeBlobContainerClient(self.account_name, container) + ) + + def get_blob_client(self, container, blob): + return self.get_container_client(container).get_blob_client(blob) + + +class FakeBlobContainerClient(object): + '''Stateful mock of a Blob container client for testing.''' + + def __init__(self, account_name, container_name): + self._container = defaultdict(FakeBlobClient) + self._account_name = account_name + self._container_name = container_name + + @property + def account_name(self): + return self._account_name + + @property + def container_name(self): + return self._container_name + + def keys(self): + return self._container.keys() + + def get_container_properties(self): + return {"account_name": self.account_name, "container_name": self.container_name} + + def has_blob(self, path): + return bool(self._container.get(path)) + + def get_blob_client(self, blob): + return self._container[blob] + + def create_blob(self, blob): + return self._container[blob] + + def list_blobs(self, name_starts_with=None): + for k, v in self._container.items(): + if name_starts_with is None or k.startswith(name_starts_with): + yield { + 'name': k, + # This clearly isn't actually the URL but we need a way of copying contents + # across blobs and this allows us to do it + 'url': v.contents, + } + + def delete_blob(self, blob): + # Use list to avoid mutating dict as we iterate + for k in list(self._container.keys()): + if k.startswith(blob): + del self._container[k] + + +class FakeBlobClient(object): + '''Stateful mock of a Blob client for testing.''' + + def __init__(self): + self.contents = None + self.lease = None + + def start_copy_from_url(self, url): + self.contents = url + + def get_blob_properties(self): + if self.contents is None: + raise ResourceNotFoundError("File does not exist!") + return {"lease": self.lease} + + def upload_blob(self, contents, overwrite=False, lease=None): + if self.lease is not None: + if lease != self.lease: + raise Exception("Invalid lease!") + if self.contents is None or overwrite is True: + if isinstance(contents, str): + self.contents = contents.encode('utf8') + elif isinstance(contents, io.TextIOBase): + self.contents = contents.read().encode('utf8') + elif isinstance(contents, io.IOBase): + self.contents = contents.read() + elif isinstance(contents, bytes): + self.contents = contents + # Python 2 compatibility - no base class for `file` type + elif hasattr(contents, 'read'): + self.contents = contents.read() + else: + print("Uploading unknown data") + self.contents = contents + + @property + def url(self): + return ':memory:' + + @contextmanager + def acquire_lease(self, lease_duration=-1): # pylint: disable=unused-argument + if self.lease is None: + self.lease = random.randint(0, 2 ** 9) + try: + yield self.lease + finally: + self.lease = None + else: + raise Exception("Lease already held") + + def download_blob(self): + if self.contents is None: + raise ResourceNotFoundError("File does not exist!") + return FakeBlobDownloader(contents=self.contents) + + +class
FakeBlobDownloader(object): + '''Mock of a Blob file downloader for testing.''' + + def __init__(self, contents): + self.contents = contents + + def readall(self): + return self.contents + + def readinto(self, fileobj): + fileobj.write(self.contents) diff --git a/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_adls2_file_manager.py b/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_adls2_file_manager.py index b379a73f89eeb..d5b940d3543e4 100644 --- a/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_adls2_file_manager.py +++ b/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_adls2_file_manager.py @@ -3,10 +3,9 @@ from dagster_azure.adls2 import ( ADLS2FileHandle, ADLS2FileManager, - FakeADLS2ServiceClient, + FakeADLS2Resource, adls2_plus_default_storage_defs, ) -from dagster_azure.blob import FakeBlobServiceClient from dagster import ( InputDefinition, @@ -91,10 +90,7 @@ def get_file_client(self, *_args, **kwargs): file_manager.delete_local_temp() -@mock.patch('dagster_azure.adls2.object_store.create_blob_client') -def test_depends_on_adls2_resource_intermediates( - mock_create_blob_client, storage_account, file_system -): +def test_depends_on_adls2_resource_intermediates(storage_account, file_system): @solid( input_defs=[InputDefinition('num_one', Int), InputDefinition('num_two', Int)], output_defs=[OutputDefinition(Int)], @@ -102,8 +98,7 @@ def test_depends_on_adls2_resource_intermediates( def add_numbers(_, num_one, num_two): return num_one + num_two - mock_create_blob_client.return_value = FakeBlobServiceClient(storage_account) - adls2_fake_resource = FakeADLS2ServiceClient(storage_account) + adls2_fake_resource = FakeADLS2Resource(storage_account) @pipeline( mode_defs=[ @@ -129,13 +124,13 @@ def adls2_internal_pipeline(): assert result.success assert result.result_for_solid('add_numbers').output_value() == 6 - assert file_system in adls2_fake_resource.file_systems + assert file_system in adls2_fake_resource.adls2_client.file_systems keys = set() for step_key, output_name in [('add_numbers.compute', 'result')]: keys.add(create_adls2_key(result.run_id, step_key, output_name)) - assert set(adls2_fake_resource.file_systems[file_system].keys()) == keys + assert set(adls2_fake_resource.adls2_client.file_systems[file_system].keys()) == keys def create_adls2_key(run_id, step_key, output_name): @@ -144,10 +139,7 @@ def create_adls2_key(run_id, step_key, output_name): ) -@mock.patch('dagster_azure.adls2.object_store.create_blob_client') -def test_depends_on_adls2_resource_file_manager( - mock_create_blob_client, storage_account, file_system -): +def test_depends_on_adls2_resource_file_manager(storage_account, file_system): bar_bytes = 'bar'.encode() @solid(output_defs=[OutputDefinition(ADLS2FileHandle)]) @@ -160,8 +152,7 @@ def accept_file(context, file_handle): assert isinstance(local_path, str) assert open(local_path, 'rb').read() == bar_bytes - mock_create_blob_client.return_value = FakeBlobServiceClient(storage_account) - adls2_fake_resource = FakeADLS2ServiceClient(storage_account) + adls2_fake_resource = FakeADLS2Resource(storage_account) @pipeline( mode_defs=[ @@ -181,7 +172,7 @@ def adls2_file_manager_test(): assert result.success - keys_in_bucket = set(adls2_fake_resource.file_systems[file_system].keys()) + keys_in_bucket = set(adls2_fake_resource.adls2_client.file_systems[file_system].keys()) for step_key, output_name in [ ('emit_file.compute', 'result'), diff --git 
a/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_intermediate_store.py b/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_intermediate_store.py index a93f874228352..2ae3df72f2cc4 100644 --- a/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_intermediate_store.py +++ b/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_intermediate_store.py @@ -9,6 +9,7 @@ adls2_resource, create_adls2_client, ) +from dagster_azure.blob import create_blob_client from dagster import ( Bool, @@ -102,11 +103,16 @@ def get_azure_credential(): raise Exception("AZURE_STORAGE_ACCOUNT_KEY must be set for intermediate store tests") -def get_client(storage_account): +def get_adls2_client(storage_account): creds = get_azure_credential()["key"] return create_adls2_client(storage_account, creds) +def get_blob_client(storage_account): + creds = get_azure_credential()["key"] + return create_blob_client(storage_account, creds) + + @nettest def test_using_adls2_for_subplan(storage_account, file_system): pipeline_def = define_inty_pipeline() @@ -149,10 +155,12 @@ def test_using_adls2_for_subplan(storage_account, file_system): instance, ) as context: + resource = context.scoped_resources_builder.build(required_resource_keys={'adls2'}).adls2 store = ADLS2IntermediateStore( file_system=file_system, run_id=run_id, - client=context.scoped_resources_builder.build(required_resource_keys={'adls2'}).adls2, + adls2_client=resource.adls2_client, + blob_client=resource.blob_client, ) intermediates_manager = IntermediateStoreIntermediatesManager(store) step_output_handle = StepOutputHandle('return_one.compute') @@ -206,7 +214,8 @@ def test_adls2_intermediate_store_with_type_storage_plugin(storage_account, file run_id = make_new_run_id() intermediate_store = ADLS2IntermediateStore( - client=get_client(storage_account), + adls2_client=get_adls2_client(storage_account), + blob_client=get_blob_client(storage_account), run_id=run_id, file_system=file_system, type_storage_plugin_registry=TypeStoragePluginRegistry( @@ -230,7 +239,8 @@ def test_adls2_intermediate_store_with_composite_type_storage_plugin(storage_acc run_id = make_new_run_id() intermediate_store = ADLS2IntermediateStore( - client=get_client(storage_account), + adls2_client=get_adls2_client(storage_account), + blob_client=get_blob_client(storage_account), run_id=run_id, file_system=file_system, type_storage_plugin_registry=TypeStoragePluginRegistry( @@ -251,7 +261,10 @@ def test_adls2_intermediate_store_composite_types_with_custom_serializer_for_inn run_id = make_new_run_id() intermediate_store = ADLS2IntermediateStore( - client=get_client(storage_account), run_id=run_id, file_system=file_system, + adls2_client=get_adls2_client(storage_account), + blob_client=get_blob_client(storage_account), + run_id=run_id, + file_system=file_system, ) obj_name = 'list' @@ -275,7 +288,10 @@ def test_adls2_intermediate_store_with_custom_serializer(storage_account, file_s run_id = make_new_run_id() intermediate_store = ADLS2IntermediateStore( - client=get_client(storage_account), run_id=run_id, file_system=file_system, + adls2_client=get_adls2_client(storage_account), + blob_client=get_blob_client(storage_account), + run_id=run_id, + file_system=file_system, ) with yield_empty_pipeline_context(run_id=run_id) as context: @@ -324,11 +340,13 @@ def test_adls2_pipeline_with_custom_prefix(storage_account, file_system): with scoped_pipeline_context( execution_plan, environment_dict, pipeline_run, 
instance, ) as context: + resource = context.scoped_resources_builder.build(required_resource_keys={'adls2'}).adls2 store = ADLS2IntermediateStore( run_id=result.run_id, file_system=file_system, prefix=adls2_prefix, - client=context.scoped_resources_builder.build(required_resource_keys={'adls2'}).adls2, + adls2_client=resource.adls2_client, + blob_client=resource.blob_client, ) intermediates_manager = IntermediateStoreIntermediatesManager(store) assert store.root == '/'.join(['custom_prefix', 'storage', result.run_id]) @@ -351,7 +369,8 @@ def test_adls2_intermediate_store_with_custom_prefix(storage_account, file_syste run_id = make_new_run_id() intermediate_store = ADLS2IntermediateStore( - client=get_client(storage_account), + adls2_client=get_adls2_client(storage_account), + blob_client=get_blob_client(storage_account), run_id=run_id, file_system=file_system, prefix='custom_prefix', @@ -380,12 +399,18 @@ def test_adls2_intermediate_store(storage_account, file_system): run_id_2 = make_new_run_id() intermediate_store = ADLS2IntermediateStore( - client=get_client(storage_account), run_id=run_id, file_system=file_system, + adls2_client=get_adls2_client(storage_account), + blob_client=get_blob_client(storage_account), + run_id=run_id, + file_system=file_system, ) assert intermediate_store.root == '/'.join(['dagster', 'storage', run_id]) intermediate_store_2 = ADLS2IntermediateStore( - client=get_client(storage_account), run_id=run_id_2, file_system=file_system, + adls2_client=get_adls2_client(storage_account), + blob_client=get_blob_client(storage_account), + run_id=run_id_2, + file_system=file_system, ) assert intermediate_store_2.root == '/'.join(['dagster', 'storage', run_id_2]) @@ -438,7 +463,10 @@ def test_custom_read_write_mode(storage_account, file_system): try: with yield_empty_pipeline_context(run_id=run_id) as context: intermediate_store = ADLS2IntermediateStore( - client=get_client(storage_account), run_id=run_id, file_system=file_system, + adls2_client=get_adls2_client(storage_account), + blob_client=get_blob_client(storage_account), + run_id=run_id, + file_system=file_system, ) intermediate_store.set_object( data_frame, context, resolve_dagster_type(LessSimpleDataFrame), ['data_frame'] diff --git a/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_object_store.py b/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_object_store.py index e111bedadeb5d..3a256406c2fbb 100644 --- a/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_object_store.py +++ b/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_object_store.py @@ -1,16 +1,20 @@ -from dagster_azure.adls2 import ADLS2ObjectStore, FakeADLS2ServiceClient - from dagster.core.storage.object_store import DEFAULT_SERIALIZATION_STRATEGY +from dagster_azure.adls2 import ADLS2ObjectStore, FakeADLS2ServiceClient +from dagster_azure.blob import FakeBlobServiceClient + def test_adls2_object_store( storage_account, credential, file_system, caplog ): # pylint: disable=too-many-function-args adls2_fake_client = FakeADLS2ServiceClient(storage_account, credential) + blob_fake_client = FakeBlobServiceClient(storage_account, credential) key = 'foo' # Uses mock ADLS2 client - adls2_obj_store = ADLS2ObjectStore(file_system, client=adls2_fake_client) + adls2_obj_store = ADLS2ObjectStore( + file_system, adls2_client=adls2_fake_client, blob_client=blob_fake_client + ) res = adls2_obj_store.set_object(key, True, DEFAULT_SERIALIZATION_STRATEGY) assert 
res.key == 'abfss://{fs}@{account}.dfs.core.windows.net/{key}'.format( fs=file_system, account=storage_account, key=key From bab4fd8020609adbcf5216151194fae3e297e590 Mon Sep 17 00:00:00 2001 From: Ben Sully Date: Fri, 22 May 2020 09:05:50 +0100 Subject: [PATCH 04/21] Fix import order in dagster-azure --- .../dagster_azure/adls2/fake_adls2_resource.py | 6 +++--- .../dagster-azure/dagster_azure/adls2/file_cache.py | 2 +- .../dagster-azure/dagster_azure/adls2/resources.py | 3 ++- .../dagster-azure/dagster_azure/blob/compute_log_manager.py | 1 + .../dagster-azure/dagster_azure/blob/fake_blob_client.py | 4 ++-- .../adls2_tests/test_adls2_file_cache.py | 2 +- .../dagster_azure_tests/adls2_tests/test_object_store.py | 4 ++-- 7 files changed, 12 insertions(+), 10 deletions(-) diff --git a/python_modules/libraries/dagster-azure/dagster_azure/adls2/fake_adls2_resource.py b/python_modules/libraries/dagster-azure/dagster_azure/adls2/fake_adls2_resource.py index 1e27f388f96db..75b9eae2b1cd4 100644 --- a/python_modules/libraries/dagster-azure/dagster_azure/adls2/fake_adls2_resource.py +++ b/python_modules/libraries/dagster-azure/dagster_azure/adls2/fake_adls2_resource.py @@ -1,13 +1,13 @@ -from collections import defaultdict -from contextlib import contextmanager import io import random +from collections import defaultdict +from contextlib import contextmanager from azure.core.exceptions import ResourceNotFoundError +from dagster_azure.blob import FakeBlobServiceClient from dagster.seven import mock -from dagster_azure.blob import FakeBlobServiceClient from .resources import ADLS2Resource diff --git a/python_modules/libraries/dagster-azure/dagster_azure/adls2/file_cache.py b/python_modules/libraries/dagster-azure/dagster_azure/adls2/file_cache.py index 6468039dd86cc..f161c45b6d55b 100644 --- a/python_modules/libraries/dagster-azure/dagster_azure/adls2/file_cache.py +++ b/python_modules/libraries/dagster-azure/dagster_azure/adls2/file_cache.py @@ -1,6 +1,6 @@ from azure.core.exceptions import ResourceNotFoundError -from dagster import Field, String, StringSource, Selector, check, resource +from dagster import Field, Selector, String, StringSource, check, resource from dagster.core.storage.file_cache import FileCache from .file_manager import ADLS2FileHandle diff --git a/python_modules/libraries/dagster-azure/dagster_azure/adls2/resources.py b/python_modules/libraries/dagster-azure/dagster_azure/adls2/resources.py index 0584244e038ef..f77073d45ea59 100644 --- a/python_modules/libraries/dagster-azure/dagster_azure/adls2/resources.py +++ b/python_modules/libraries/dagster-azure/dagster_azure/adls2/resources.py @@ -1,6 +1,7 @@ +from dagster_azure.blob.utils import create_blob_client + from dagster import Field, Selector, String, StringSource, resource -from dagster_azure.blob.utils import create_blob_client from .utils import create_adls2_client diff --git a/python_modules/libraries/dagster-azure/dagster_azure/blob/compute_log_manager.py b/python_modules/libraries/dagster-azure/dagster_azure/blob/compute_log_manager.py index adf53296dfe06..e753a5033bad0 100644 --- a/python_modules/libraries/dagster-azure/dagster_azure/blob/compute_log_manager.py +++ b/python_modules/libraries/dagster-azure/dagster_azure/blob/compute_log_manager.py @@ -3,6 +3,7 @@ from contextlib import contextmanager from azure.storage.blob import generate_blob_sas + from dagster import Field, check, seven from dagster.core.storage.compute_log_manager import ( MAX_BYTES_FILE_READ, diff --git 
a/python_modules/libraries/dagster-azure/dagster_azure/blob/fake_blob_client.py b/python_modules/libraries/dagster-azure/dagster_azure/blob/fake_blob_client.py index 31dcea2036b70..ceef428ef2338 100644 --- a/python_modules/libraries/dagster-azure/dagster_azure/blob/fake_blob_client.py +++ b/python_modules/libraries/dagster-azure/dagster_azure/blob/fake_blob_client.py @@ -1,7 +1,7 @@ -from collections import defaultdict -from contextlib import contextmanager import io import random +from collections import defaultdict +from contextlib import contextmanager from azure.core.exceptions import ResourceNotFoundError diff --git a/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_adls2_file_cache.py b/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_adls2_file_cache.py index eae06d53ab9c1..a14f315afacec 100644 --- a/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_adls2_file_cache.py +++ b/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_adls2_file_cache.py @@ -1,6 +1,6 @@ import io -from dagster_azure.adls2 import FakeADLS2ServiceClient, ADLS2FileCache, ADLS2FileHandle +from dagster_azure.adls2 import ADLS2FileCache, ADLS2FileHandle, FakeADLS2ServiceClient def test_adls2_file_cache_file_not_present(storage_account, file_system, credential): diff --git a/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_object_store.py b/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_object_store.py index 3a256406c2fbb..7d3d63cbbf638 100644 --- a/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_object_store.py +++ b/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_object_store.py @@ -1,8 +1,8 @@ -from dagster.core.storage.object_store import DEFAULT_SERIALIZATION_STRATEGY - from dagster_azure.adls2 import ADLS2ObjectStore, FakeADLS2ServiceClient from dagster_azure.blob import FakeBlobServiceClient +from dagster.core.storage.object_store import DEFAULT_SERIALIZATION_STRATEGY + def test_adls2_object_store( storage_account, credential, file_system, caplog From d179eeffefdaaddc99b4dc16f6c3a67d1451b10f Mon Sep 17 00:00:00 2001 From: Ben Sully Date: Fri, 22 May 2020 09:11:05 +0100 Subject: [PATCH 05/21] Add dagster-azure to install_dev_python_modules make target --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index cf4b0084a3f92..61849c65d8015 100644 --- a/Makefile +++ b/Makefile @@ -40,6 +40,7 @@ install_dev_python_modules: -e python_modules/dagit \ -e python_modules/libraries/dagster-pandas \ -e python_modules/libraries/dagster-aws \ + -e python_modules/libraries/dagster-azure \ -e python_modules/libraries/dagster-bash \ -e python_modules/libraries/dagster-celery \ -e python_modules/libraries/dagster-cron \ From af9f48621b63ff369f6ebe448c8b9d0a6ad9054d Mon Sep 17 00:00:00 2001 From: Ben Sully Date: Fri, 22 May 2020 09:44:05 +0100 Subject: [PATCH 06/21] Include azure-storage-blob in dagster-azure requirements --- python_modules/libraries/dagster-azure/setup.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python_modules/libraries/dagster-azure/setup.py b/python_modules/libraries/dagster-azure/setup.py index 040c2a539e816..70a2152a11a11 100644 --- a/python_modules/libraries/dagster-azure/setup.py +++ b/python_modules/libraries/dagster-azure/setup.py @@ -38,7 +38,11 @@ def _do_setup(name='dagster-azure'): ], packages=find_packages(exclude=['test']), 
include_package_data=True, - install_requires=['azure-storage-file-datalake~=12.0.1', 'dagster'], + install_requires=[ + 'azure-storage-blob~=12.3.0', + 'azure-storage-file-datalake~=12.0.1', + 'dagster', + ], entry_points={'console_scripts': ['dagster-azure = dagster_azure.cli.cli:main']}, zip_safe=False, ) From 1f44d490c1336950201b3fe9876927b77c1ca70c Mon Sep 17 00:00:00 2001 From: Ben Sully Date: Sun, 24 May 2020 13:48:45 +0100 Subject: [PATCH 07/21] Remove unused variable in tests --- .../dagster_azure_tests/adls2_tests/test_intermediate_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_intermediate_store.py b/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_intermediate_store.py index 2ae3df72f2cc4..b458fd7da6ee9 100644 --- a/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_intermediate_store.py +++ b/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/test_intermediate_store.py @@ -99,7 +99,7 @@ def get_step_output(step_events, step_key, output_name='result'): def get_azure_credential(): try: return {'key': os.environ["AZURE_STORAGE_ACCOUNT_KEY"]} - except KeyError as err: + except KeyError: raise Exception("AZURE_STORAGE_ACCOUNT_KEY must be set for intermediate store tests") From 42998a68fa373a65c743145b4113e40830b6af34 Mon Sep 17 00:00:00 2001 From: Ben Sully Date: Thu, 28 May 2020 09:13:00 +0100 Subject: [PATCH 08/21] Don't install dagster-azure as part of install_dev_python_modules make target --- Makefile | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 61849c65d8015..7a6b3f1064a93 100644 --- a/Makefile +++ b/Makefile @@ -40,7 +40,6 @@ install_dev_python_modules: -e python_modules/dagit \ -e python_modules/libraries/dagster-pandas \ -e python_modules/libraries/dagster-aws \ - -e python_modules/libraries/dagster-azure \ -e python_modules/libraries/dagster-bash \ -e python_modules/libraries/dagster-celery \ -e python_modules/libraries/dagster-cron \ @@ -67,6 +66,15 @@ install_dev_python_modules: -e examples[full] \ -r scala_modules/scripts/requirements.txt $(QUIET) + + # Don't install dagster-azure as part of this target _yet_ - it has a dependency + # conflict with dagster-snowflake which causes any import of dagster-snowflake to + # fail with an ImportError (e.g. in examples). + # Uncomment only when snowflake-connector-python can be installed with optional (or compatible) + # Azure dependencies. + # See https://github.com/dagster-io/dagster/pull/2483#issuecomment-635174157 + # pip install -e python_modules/libraries/dagster-azure $(QUIET) + SLUGIFY_USES_TEXT_UNIDECODE=yes pip install -e python_modules/libraries/dagster-airflow $(QUIET) # NOTE: These installations will fail for Python 2.7 (Flyte and Dask don't work w/ py27) From 9a84309a0a88a1db06242903e7742269478d270d Mon Sep 17 00:00:00 2001 From: Ben Sully Date: Thu, 28 May 2020 09:13:44 +0100 Subject: [PATCH 09/21] Remove accidentally committed Azure Blob object/intermediate store implementations These work but have no immediate use case and no tests, so seem like an unnecessary maintenance burden. This commit can be reverted if they're needed! 
--- .../dagster_azure/blob/intermediate_store.py | 41 ------ .../dagster_azure/blob/object_store.py | 138 ------------------ 2 files changed, 179 deletions(-) delete mode 100644 python_modules/libraries/dagster-azure/dagster_azure/blob/intermediate_store.py delete mode 100644 python_modules/libraries/dagster-azure/dagster_azure/blob/object_store.py diff --git a/python_modules/libraries/dagster-azure/dagster_azure/blob/intermediate_store.py b/python_modules/libraries/dagster-azure/dagster_azure/blob/intermediate_store.py deleted file mode 100644 index 603f3d6863ce3..0000000000000 --- a/python_modules/libraries/dagster-azure/dagster_azure/blob/intermediate_store.py +++ /dev/null @@ -1,41 +0,0 @@ -from dagster import check -from dagster.core.storage.intermediate_store import IntermediateStore -from dagster.core.storage.type_storage import TypeStoragePluginRegistry - -from .object_store import AzureBlobObjectStore - - -class AzureBlobIntermediateStore(IntermediateStore): - '''Intermediate store using Azure Blob storage. - - If your storage account has the ADLS Gen2 hierarchical namespace enabled - this should still work, but it is recommended to use the - :py:class:`~dagster_azure.adls2.intermediate_store.ADLS2IntermediateStore` - instead, which will enable some optimizations for certain types (notably - PySpark DataFrames). - ''' - - def __init__( - self, container, run_id, client, type_storage_plugin_registry=None, prefix='dagster', - ): - check.str_param(container, 'container') - check.str_param(prefix, 'prefix') - check.str_param(run_id, 'run_id') - - object_store = AzureBlobObjectStore(container, client) - - def root_for_run_id(r_id): - return object_store.key_for_paths([prefix, 'storage', r_id]) - - super(AzureBlobIntermediateStore, self).__init__( - object_store, - root_for_run_id=root_for_run_id, - run_id=run_id, - type_storage_plugin_registry=check.inst_param( - type_storage_plugin_registry - if type_storage_plugin_registry - else TypeStoragePluginRegistry(types_to_register=[]), - 'type_storage_plugin_registry', - TypeStoragePluginRegistry, - ), - ) diff --git a/python_modules/libraries/dagster-azure/dagster_azure/blob/object_store.py b/python_modules/libraries/dagster-azure/dagster_azure/blob/object_store.py deleted file mode 100644 index be6729cdc09b3..0000000000000 --- a/python_modules/libraries/dagster-azure/dagster_azure/blob/object_store.py +++ /dev/null @@ -1,138 +0,0 @@ -import logging -import re -import sys -from io import BytesIO, StringIO - -from azure.core.exceptions import ResourceNotFoundError - -from dagster import check -from dagster.core.definitions.events import ObjectStoreOperation, ObjectStoreOperationType -from dagster.core.storage.object_store import ObjectStore -from dagster.core.types.marshal import SerializationStrategy - -DEFAULT_LEASE_DURATION = 60 * 60 # One hour - - -class AzureBlobObjectStore(ObjectStore): - def __init__(self, container, client, lease_duration=DEFAULT_LEASE_DURATION): - self.blob_client = client - self.container_client = self.blob_client.get_container_client(container) - - self.lease_duration = lease_duration - self.container_client.get_container_properties() - super(AzureBlobObjectStore, self).__init__('azure-blob', sep='/') - - def set_object(self, key, obj, serialization_strategy=None): - check.str_param(key, 'key') - - logging.info('Writing Azure Blob object at: ' + self.uri_for_key(key)) - - # cannot check obj since could be arbitrary Python object - check.inst_param( - serialization_strategy, 'serialization_strategy', 
SerializationStrategy - ) # cannot be none here - - blob = self.container_client.create_blob(key) - with blob.acquire_lease(self.lease_duration) as lease: - with BytesIO() as bytes_io: - if serialization_strategy.write_mode == 'w' and sys.version_info >= (3, 0): - with StringIO() as string_io: - string_io = StringIO() - serialization_strategy.serialize(obj, string_io) - string_io.seek(0) - bytes_io.write(string_io.read().encode('utf-8')) - else: - serialization_strategy.serialize(obj, bytes_io) - bytes_io.seek(0) - blob.upload_blob(bytes_io, lease=lease, overwrite=True) - - return ObjectStoreOperation( - op=ObjectStoreOperationType.SET_OBJECT, - key=self.uri_for_key(key), - dest_key=None, - obj=obj, - serialization_strategy_name=serialization_strategy.name, - object_store_name=self.name, - ) - - def get_object(self, key, serialization_strategy=None): - check.str_param(key, 'key') - check.param_invariant(len(key) > 0, 'key') - check.inst_param( - serialization_strategy, 'serialization_strategy', SerializationStrategy - ) # cannot be none here - - # FIXME we need better error handling for object store - blob = self.container_client.download_blob(key) - obj = serialization_strategy.deserialize( - BytesIO(blob.readall()) - if serialization_strategy.read_mode == 'rb' - else StringIO(blob.readall().decode(serialization_strategy.encoding)) - ) - return ObjectStoreOperation( - op=ObjectStoreOperationType.GET_OBJECT, - key=self.uri_for_key(key), - dest_key=None, - obj=obj, - serialization_strategy_name=serialization_strategy.name, - object_store_name=self.name, - ) - - def has_object(self, key): - check.str_param(key, 'key') - check.param_invariant(len(key) > 0, 'key') - - try: - blob = self.container_client.get_blob_client(key) - blob.get_blob_properties() - return True - except ResourceNotFoundError: - return False - - def rm_object(self, key): - check.str_param(key, 'key') - check.param_invariant(len(key) > 0, 'key') - - for blob in self.container_client.list_blobs(key): - self.container_client.delete_blob(blob) - - return ObjectStoreOperation( - op=ObjectStoreOperationType.RM_OBJECT, - key=self.uri_for_key(key), - dest_key=None, - obj=None, - serialization_strategy_name=None, - object_store_name=self.name, - ) - - def cp_object(self, src, dst): - check.str_param(src, 'src') - check.str_param(dst, 'dst') - - # Manually recurse and copy anything that looks like a file. 
- for src_blob_properties in self.container_client.list_blobs(src): - # This is the only way I can find to identify a 'directory' - if src_blob_properties['content_settings'] is None: - # Ignore this blob - continue - src_blob = self.container_client.get_blob_client(src_blob_properties['name']) - dst_blob_path = re.sub(r'^{}'.format(src), dst, src_blob_properties['name']) - dst_blob = self.container_client.get_blob_client(dst_blob_path) - dst_blob.start_copy_from_url(src_blob.url) - - return ObjectStoreOperation( - op=ObjectStoreOperationType.CP_OBJECT, - key=self.uri_for_key(src), - dest_key=self.uri_for_key(dst), - object_store_name=self.name, - ) - - def uri_for_key(self, key, protocol=None): - check.str_param(key, 'key') - protocol = check.opt_str_param(protocol, 'protocol', default='https://') - return '{protocol}@{account}.blob.core.windows.net/{container}/{key}'.format( - protocol=protocol, - account=self.blob_client.account_name, - container=self.container_client.container_name, - key=key, - ) From d9cd2538070f1ddf425c5e664444ae35970db9bf Mon Sep 17 00:00:00 2001 From: Ben Sully Date: Thu, 28 May 2020 09:15:11 +0100 Subject: [PATCH 10/21] Wrap potentially incompatible imports to add a custom warning This centralizes the various azure-storage/azure-core imports and wraps them, plus the snowflake-connector import, in a try/except block, adding a custom warning with a suggested solution if the import fails. --- .../dagster_azure/adls2/fake_adls2_resource.py | 2 +- .../dagster_azure/adls2/file_cache.py | 4 +--- .../dagster_azure/adls2/object_store.py | 4 ++-- .../dagster-azure/dagster_azure/adls2/utils.py | 17 ++++++++++++++++- .../dagster_azure/blob/compute_log_manager.py | 4 +--- .../dagster_azure/blob/fake_blob_client.py | 4 ++-- .../dagster-azure/dagster_azure/blob/utils.py | 17 ++++++++++++++++- .../dagster_snowflake/resources.py | 14 +++++++++++++- 8 files changed, 52 insertions(+), 14 deletions(-) diff --git a/python_modules/libraries/dagster-azure/dagster_azure/adls2/fake_adls2_resource.py b/python_modules/libraries/dagster-azure/dagster_azure/adls2/fake_adls2_resource.py index 75b9eae2b1cd4..70ef2a2219371 100644 --- a/python_modules/libraries/dagster-azure/dagster_azure/adls2/fake_adls2_resource.py +++ b/python_modules/libraries/dagster-azure/dagster_azure/adls2/fake_adls2_resource.py @@ -3,12 +3,12 @@ from collections import defaultdict from contextlib import contextmanager -from azure.core.exceptions import ResourceNotFoundError from dagster_azure.blob import FakeBlobServiceClient from dagster.seven import mock from .resources import ADLS2Resource +from .utils import ResourceNotFoundError class FakeADLS2Resource(ADLS2Resource): diff --git a/python_modules/libraries/dagster-azure/dagster_azure/adls2/file_cache.py b/python_modules/libraries/dagster-azure/dagster_azure/adls2/file_cache.py index f161c45b6d55b..1a33454e4913b 100644 --- a/python_modules/libraries/dagster-azure/dagster_azure/adls2/file_cache.py +++ b/python_modules/libraries/dagster-azure/dagster_azure/adls2/file_cache.py @@ -1,10 +1,8 @@ -from azure.core.exceptions import ResourceNotFoundError - from dagster import Field, Selector, String, StringSource, check, resource from dagster.core.storage.file_cache import FileCache from .file_manager import ADLS2FileHandle -from .utils import create_adls2_client +from .utils import ResourceNotFoundError, create_adls2_client class ADLS2FileCache(FileCache): diff --git a/python_modules/libraries/dagster-azure/dagster_azure/adls2/object_store.py 
b/python_modules/libraries/dagster-azure/dagster_azure/adls2/object_store.py index f05b087259327..2a0e27fae3e27 100644 --- a/python_modules/libraries/dagster-azure/dagster_azure/adls2/object_store.py +++ b/python_modules/libraries/dagster-azure/dagster_azure/adls2/object_store.py @@ -3,13 +3,13 @@ import sys from io import BytesIO, StringIO -from azure.core.exceptions import ResourceNotFoundError - from dagster import check from dagster.core.definitions.events import ObjectStoreOperation, ObjectStoreOperationType from dagster.core.storage.object_store import ObjectStore from dagster.core.types.marshal import SerializationStrategy +from .utils import ResourceNotFoundError + DEFAULT_LEASE_DURATION = 60 # One minute diff --git a/python_modules/libraries/dagster-azure/dagster_azure/adls2/utils.py b/python_modules/libraries/dagster-azure/dagster_azure/adls2/utils.py index 927bde23c10a7..94278d8dc8dd0 100644 --- a/python_modules/libraries/dagster-azure/dagster_azure/adls2/utils.py +++ b/python_modules/libraries/dagster-azure/dagster_azure/adls2/utils.py @@ -1,4 +1,19 @@ -from azure.storage.filedatalake import DataLakeServiceClient +import warnings + +try: + # Centralise Azure imports here so we only need to warn in one place + from azure.core.exceptions import ResourceNotFoundError + from azure.storage.filedatalake import DataLakeServiceClient +except ImportError: + msg = ( + "Could not import required Azure objects. This probably means you have an old version " + "of azure-storage-blob installed. dagster-azure requires azure-storage-blob~=12.0.0; " + "this conflicts with dagster-snowflake which requires azure-storage-blob<12.0.0 and is " + "incompatible. Please uninstall dagster-snowflake and reinstall dagster-azure to fix " + "this error." + ) + warnings.warn(msg) + raise def _create_url(storage_account, subdomain): diff --git a/python_modules/libraries/dagster-azure/dagster_azure/blob/compute_log_manager.py b/python_modules/libraries/dagster-azure/dagster_azure/blob/compute_log_manager.py index e753a5033bad0..5eff1fe84839a 100644 --- a/python_modules/libraries/dagster-azure/dagster_azure/blob/compute_log_manager.py +++ b/python_modules/libraries/dagster-azure/dagster_azure/blob/compute_log_manager.py @@ -2,8 +2,6 @@ import os from contextlib import contextmanager -from azure.storage.blob import generate_blob_sas - from dagster import Field, check, seven from dagster.core.storage.compute_log_manager import ( MAX_BYTES_FILE_READ, @@ -15,7 +13,7 @@ from dagster.serdes import ConfigurableClass, ConfigurableClassData from dagster.utils import ensure_dir, ensure_file -from .utils import create_blob_client +from .utils import create_blob_client, generate_blob_sas class AzureBlobComputeLogManager(ComputeLogManager, ConfigurableClass): diff --git a/python_modules/libraries/dagster-azure/dagster_azure/blob/fake_blob_client.py b/python_modules/libraries/dagster-azure/dagster_azure/blob/fake_blob_client.py index ceef428ef2338..efc4dd7a8eda3 100644 --- a/python_modules/libraries/dagster-azure/dagster_azure/blob/fake_blob_client.py +++ b/python_modules/libraries/dagster-azure/dagster_azure/blob/fake_blob_client.py @@ -3,10 +3,10 @@ from collections import defaultdict from contextlib import contextmanager -from azure.core.exceptions import ResourceNotFoundError - from dagster.seven import mock +from .utils import ResourceNotFoundError + class FakeBlobServiceClient(object): '''Stateful mock of an Blob service client for testing. 
diff --git a/python_modules/libraries/dagster-azure/dagster_azure/blob/utils.py b/python_modules/libraries/dagster-azure/dagster_azure/blob/utils.py index 01f01d156fbe9..477c446348bbd 100644 --- a/python_modules/libraries/dagster-azure/dagster_azure/blob/utils.py +++ b/python_modules/libraries/dagster-azure/dagster_azure/blob/utils.py @@ -1,4 +1,19 @@ -from azure.storage.blob import BlobServiceClient +import warnings + +try: + # Centralise Azure imports here so we only need to warn in one place + from azure.core.exceptions import ResourceNotFoundError + from azure.storage.blob import generate_blob_sas, BlobServiceClient +except ImportError: + msg = ( + "Could not import required Azure objects. This probably means you have an old version " + "of azure-storage-blob installed. dagster-azure requires azure-storage-blob~=12.0.0; " + "this conflicts with dagster-snowflake which requires azure-storage-blob<12.0.0 and is " + "incompatible. Please uninstall dagster-snowflake and reinstall dagster-azure to fix " + "this error." + ) + warnings.warn(msg) + raise def _create_url(storage_account, subdomain): diff --git a/python_modules/libraries/dagster-snowflake/dagster_snowflake/resources.py b/python_modules/libraries/dagster-snowflake/dagster_snowflake/resources.py index 8b80afcb0030d..d8535a45f1240 100644 --- a/python_modules/libraries/dagster-snowflake/dagster_snowflake/resources.py +++ b/python_modules/libraries/dagster-snowflake/dagster_snowflake/resources.py @@ -1,7 +1,19 @@ import sys +import warnings from contextlib import closing, contextmanager -import snowflake.connector +try: + import snowflake.connector +except ImportError: + msg = ( + "Could not import snowflake.connector. This could mean you have an incompatible version " + "of azure-storage-blob installed. dagster-snowflake requires azure-storage-blob<12.0.0; " + "this conflicts with dagster-azure which requires azure-storage-blob~=12.0.0 and is " + "incompatible with dagster-snowflake. Please uninstall dagster-azure and reinstall " + "dagster-snowflake to fix this error." + ) + warnings.warn(msg) + raise from dagster import check, resource From ff2f81afc2fb2f0473d44855580166fafbbad118 Mon Sep 17 00:00:00 2001 From: Ben Sully Date: Thu, 28 May 2020 09:26:59 +0100 Subject: [PATCH 11/21] Add README to dagster-azure and note about incompatibility to dagster-snowflake's README --- python_modules/libraries/dagster-azure/README.md | 13 +++++++++++++ .../libraries/dagster-snowflake/README.md | 2 ++ 2 files changed, 15 insertions(+) create mode 100644 python_modules/libraries/dagster-azure/README.md diff --git a/python_modules/libraries/dagster-azure/README.md b/python_modules/libraries/dagster-azure/README.md new file mode 100644 index 0000000000000..7ff67917151aa --- /dev/null +++ b/python_modules/libraries/dagster-azure/README.md @@ -0,0 +1,13 @@ +# dagster-azure + +Utilities for using Azure Storage Accounts with Dagster. This is mostly aimed at Azure Data Lake Storage Gen 2 (ADLS2) but also contains some utilities for Azure Blob Storage. + +**This package is incompatible with dagster-snowflake!** This is due to a version mismatch between the underlying azure-storage-blob package (dagster-snowflake has a transitive dependency on an old version, via snowflake-connector-python). 
+ + ## Utilities + +- ADLS2 file cache (see the `dagster_azure.adls2.adls2_file_cache` resource) +- The `dagster_azure.adls2.adls2_resource`, which provides solids with access to an ADLS2 client +- Persistent storage using ADLS2 (see `dagster_azure.adls2.adls2_system_storage`) +- Log management using Azure Blob Storage (also compatible with ADLS - see `dagster_azure.blob.AzureBlobComputeLogManager`) +- Fake clients for use in tests (see `dagster_azure.adls2.FakeADLS2Resource`) diff --git a/python_modules/libraries/dagster-snowflake/README.md b/python_modules/libraries/dagster-snowflake/README.md index c99a0657e558f..c021e57497d9a 100644 --- a/python_modules/libraries/dagster-snowflake/README.md +++ b/python_modules/libraries/dagster-snowflake/README.md @@ -8,6 +8,8 @@ Presently, it provides two solids for interacting with Snowflake, `SnowflakeSoli Both of these solids depend on `snowflake_resource`, which is a Dagster resource for configuring Snowflake connections. +**This package is incompatible with dagster-azure!** This is due to a version mismatch between the underlying azure-storage-blob package (dagster-snowflake has a transitive dependency on an older version, via snowflake-connector-python); dagster-azure requires a newer version of the Azure packages. + ## Getting Started To use this library, you should first ensure that you have an appropriate [Snowflake user](https://docs.snowflake.net/manuals/user-guide/admin-user-management.html) configured to access your data warehouse. From ddb9c3d5580989995c4cc82d45c96591e1256b9a Mon Sep 17 00:00:00 2001 From: Ben Sully Date: Tue, 2 Jun 2020 09:19:53 +0100 Subject: [PATCH 12/21] Isort --- .../dagster-snowflake/dagster_snowflake/resources.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python_modules/libraries/dagster-snowflake/dagster_snowflake/resources.py b/python_modules/libraries/dagster-snowflake/dagster_snowflake/resources.py index d8535a45f1240..914808296e176 100644 --- a/python_modules/libraries/dagster-snowflake/dagster_snowflake/resources.py +++ b/python_modules/libraries/dagster-snowflake/dagster_snowflake/resources.py @@ -2,6 +2,10 @@ import warnings from contextlib import closing, contextmanager +from dagster import check, resource + +from .configs import define_snowflake_config + try: import snowflake.connector except ImportError: @@ -15,9 +19,7 @@ warnings.warn(msg) raise -from dagster import check, resource -from .configs import define_snowflake_config From c8e068b0269d2473a938d026aea1371518bfa681 Mon Sep 17 00:00:00 2001 From: Sandy Ryza Date: Tue, 2 Jun 2020 13:13:29 -0700 Subject: [PATCH 13/21] Set buildkite container for dagster-azure tests --- .../dagster_azure_tests/adls2_tests/conftest.py | 7 ++++--- .../blob_tests/test_compute_log_manager.py | 2 ++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/conftest.py b/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/conftest.py index da13cc0c80694..155e498d533f3 100644 --- a/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/conftest.py +++ b/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/conftest.py @@ -1,16 +1,17 @@ +import os import pytest @pytest.fixture(scope='session') def storage_account(): - yield 'dagsterdatabrickstests' + yield 'elementldevstorage' @pytest.fixture(scope='session') def file_system(): - yield 'dagster-databricks-tests' + yield 'dagster-azure-tests'
@pytest.fixture(scope='session') def credential(): - yield 'super-secret-creds' + yield os.environ.get('AZURE_STORAGE_ACCOUNT_KEY') diff --git a/python_modules/libraries/dagster-azure/dagster_azure_tests/blob_tests/test_compute_log_manager.py b/python_modules/libraries/dagster-azure/dagster_azure_tests/blob_tests/test_compute_log_manager.py index deb9576a512b0..3097429ac7e8f 100644 --- a/python_modules/libraries/dagster-azure/dagster_azure_tests/blob_tests/test_compute_log_manager.py +++ b/python_modules/libraries/dagster-azure/dagster_azure_tests/blob_tests/test_compute_log_manager.py @@ -6,6 +6,7 @@ from dagster import DagsterEventType, execute_pipeline, pipeline, seven, solid from dagster.core.instance import DagsterInstance, InstanceType +from dagster.core.launcher.sync_in_memory_run_launcher import SyncInMemoryRunLauncher from dagster.core.storage.compute_log_manager import ComputeIOType from dagster.core.storage.event_log import SqliteEventLogStorage from dagster.core.storage.root import LocalArtifactStorage @@ -56,6 +57,7 @@ def easy(context): run_storage=run_store, event_storage=event_store, compute_log_manager=manager, + run_launcher=SyncInMemoryRunLauncher(), ) result = execute_pipeline(simple, instance=instance) compute_steps = [ From 716d265bbfc3766aecf3f367dd5db2b131f19ce6 Mon Sep 17 00:00:00 2001 From: Sandy Ryza Date: Tue, 2 Jun 2020 16:30:40 -0700 Subject: [PATCH 14/21] Env variables in buildkite for Azure --- .buildkite/pipeline.py | 4 ++++ .../dagster-azure/dagster_azure_tests/adls2_tests/conftest.py | 1 + 2 files changed, 5 insertions(+) diff --git a/.buildkite/pipeline.py b/.buildkite/pipeline.py index 2face1cb352f6..e4e1c6718275f 100644 --- a/.buildkite/pipeline.py +++ b/.buildkite/pipeline.py @@ -243,6 +243,10 @@ def postgres_extra_cmds_fn(_): # See: https://github.com/dagster-io/dagster/issues/1960 supported_pythons=SupportedPythonsNo38, ), + ModuleBuildSpec( + 'python_modules/libraries/dagster-azure', + env_vars=['AZURE_DATA_LAKE_STORAGE_KEY', 'AZURE_STORAGE_ACCOUNT_KEY'], + ), ModuleBuildSpec( 'python_modules/libraries/dagster-celery', env_vars=['AWS_ACCOUNT_ID', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'], diff --git a/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/conftest.py b/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/conftest.py index 155e498d533f3..051189ed3724e 100644 --- a/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/conftest.py +++ b/python_modules/libraries/dagster-azure/dagster_azure_tests/adls2_tests/conftest.py @@ -1,4 +1,5 @@ import os + import pytest From bc07898c849ade33a4ba0bfe949f4d940bb32b5d Mon Sep 17 00:00:00 2001 From: Sandy Ryza Date: Tue, 2 Jun 2020 16:58:28 -0700 Subject: [PATCH 15/21] Add dagster-azure package with various storage components Summary: This adds the following components based on Azure Data Lake Storage Gen2 (and Azure Blob Storage where appropriate): - ADLS2FileCache and adls2_file_cache - ADLS2FileManager - ADLS2IntermediateStore - ADLS2ObjectStore - the adls2_resource providing direct access to Azure Data Lake Storage - the adls2_system_storage system storage This is pretty similar to the S3 implementation, the main difference being configuration: Azure's SDK requires credentials to be passed explicitly, so the credential is expected in configuration. Tests currently require an access key to complete any tests marked 'nettest'. 
From bc07898c849ade33a4ba0bfe949f4d940bb32b5d Mon Sep 17 00:00:00 2001
From: Sandy Ryza
Date: Tue, 2 Jun 2020 16:58:28 -0700
Subject: [PATCH 15/21] Add dagster-azure package with various storage components

Summary:
This adds the following components based on Azure Data Lake Storage Gen2 (and
Azure Blob Storage where appropriate):

- ADLS2FileCache and adls2_file_cache
- ADLS2FileManager
- ADLS2IntermediateStore
- ADLS2ObjectStore
- the adls2_resource providing direct access to Azure Data Lake Storage
- the adls2_system_storage system storage

This is pretty similar to the S3 implementation, the main difference being
configuration: Azure's SDK requires credentials to be passed explicitly, so
the credential is expected in configuration.

Tests currently require an access key to complete any tests marked 'nettest'.

Rename Fake Azure classes and modules to more English-friendly names

Add ADLS2Resource to wrap ADLS2/Blob clients

Fix import order in dagster-azure

Add dagster-azure to install_dev_python_modules make target

Include azure-storage-blob in dagster-azure requirements

Remove unused variable in tests

Don't install dagster-azure as part of install_dev_python_modules make target

Remove accidentally committed Azure Blob object/intermediate store implementations

These work but have no immediate use case and no tests, so seem like an
unnecessary maintenance burden. This commit can be reverted if they're needed!

Wrap potentially incompatible imports to add a custom warning

This centralizes the various azure-storage/azure-core imports and wraps them,
plus the snowflake-connector import, in a try/except block, adding a custom
warning with a suggested solution if the import fails.

Add README to dagster-azure and note about incompatibility to dagster-snowflake's README

Isort

Set buildkite container for dagster-azure tests

Merge pull request #1 from dagster-io/dagster-azure

Set buildkite container for dagster-azure tests

Env variables in buildkite for Azure

Test Plan: bk

Differential Revision: https://dagster.phacility.com/D3238
---
 .../libraries/dagster-azure/dagster_azure/adls2/utils.py | 2 +-
 .../libraries/dagster-azure/dagster_azure/blob/utils.py  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/python_modules/libraries/dagster-azure/dagster_azure/adls2/utils.py b/python_modules/libraries/dagster-azure/dagster_azure/adls2/utils.py
index 94278d8dc8dd0..668c527ab087c 100644
--- a/python_modules/libraries/dagster-azure/dagster_azure/adls2/utils.py
+++ b/python_modules/libraries/dagster-azure/dagster_azure/adls2/utils.py
@@ -2,7 +2,7 @@
 
 try:
     # Centralise Azure imports here so we only need to warn in one place
-    from azure.core.exceptions import ResourceNotFoundError
+    from azure.core.exceptions import ResourceNotFoundError  # pylint: disable=unused-import
     from azure.storage.filedatalake import DataLakeServiceClient
 except ImportError:
     msg = (
diff --git a/python_modules/libraries/dagster-azure/dagster_azure/blob/utils.py b/python_modules/libraries/dagster-azure/dagster_azure/blob/utils.py
index 477c446348bbd..76e39b6006132 100644
--- a/python_modules/libraries/dagster-azure/dagster_azure/blob/utils.py
+++ b/python_modules/libraries/dagster-azure/dagster_azure/blob/utils.py
@@ -2,8 +2,8 @@
 
 try:
     # Centralise Azure imports here so we only need to warn in one place
-    from azure.core.exceptions import ResourceNotFoundError
-    from azure.storage.blob import generate_blob_sas, BlobServiceClient
+    from azure.core.exceptions import ResourceNotFoundError  # pylint: disable=unused-import
+    from azure.storage.blob import generate_blob_sas, BlobServiceClient  # pylint: disable=unused-import
 except ImportError:
     msg = (
         "Could not import required Azure objects. This probably means you have an old version "
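The "Wrap potentially incompatible imports" change described above guards the Azure imports with a try/except that warns before re-raising. A minimal sketch of that pattern follows; the warning text is a paraphrase, not the exact message used in dagster-azure.

```python
# Sketch of the guarded-import pattern described above. The import list and
# message are illustrative; dagster-azure's actual wording differs.
import warnings

try:
    from azure.storage.filedatalake import DataLakeServiceClient  # noqa: F401
except ImportError:
    warnings.warn(
        "Could not import required Azure objects. This probably means an incompatible "
        "azure-storage-blob version is installed (for example via snowflake-connector-python); "
        "see the dagster-snowflake README note earlier in this series."
    )
    raise
```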
From a870f80a6b467cf91c736bfbaa03cfa203d89960 Mon Sep 17 00:00:00 2001
From: Sandy Ryza
Date: Tue, 2 Jun 2020 17:24:27 -0700
Subject: [PATCH 16/21] more buildkite

---
 .buildkite/pipeline.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/pipeline.py b/.buildkite/pipeline.py
index e4e1c6718275f..682995198d4b9 100644
--- a/.buildkite/pipeline.py
+++ b/.buildkite/pipeline.py
@@ -245,7 +245,7 @@ def postgres_extra_cmds_fn(_):
     ),
     ModuleBuildSpec(
         'python_modules/libraries/dagster-azure',
-        env_vars=['AZURE_DATA_LAKE_STORAGE_KEY', 'AZURE_STORAGE_ACCOUNT_KEY'],
+        env_vars=['AZURE_STORAGE_ACCOUNT_KEY'],
     ),
     ModuleBuildSpec(
         'python_modules/libraries/dagster-celery',

From 6c6da605cbdef9855e802839e01d830c21e10572 Mon Sep 17 00:00:00 2001
From: Sandy Ryza
Date: Tue, 2 Jun 2020 21:56:58 -0700
Subject: [PATCH 17/21] tox

---
 python_modules/libraries/dagster-azure/tox.ini | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/python_modules/libraries/dagster-azure/tox.ini b/python_modules/libraries/dagster-azure/tox.ini
index da4d1347591c1..022a9a415b106 100644
--- a/python_modules/libraries/dagster-azure/tox.ini
+++ b/python_modules/libraries/dagster-azure/tox.ini
@@ -22,3 +22,11 @@ commands =
   coverage report --omit='.tox/*,**/test_*.py' --skip-covered
   coverage html --omit='.tox/*,**/test_*.py'
   coverage xml --omit='.tox/*,**/test_*.py'
+
+[testenv:pylint]
+whitelist_externals =
+  pylint
+basepython =
+  python3.7
+commands =
+  pylint -j 0 --rcfile=../../../.pylintrc dagster_azure dagster_azure_tests

From 49b4030143e8478405769ed4cbb1daccc10e46d7 Mon Sep 17 00:00:00 2001
From: Ben Sully
Date: Wed, 3 Jun 2020 08:52:45 +0100
Subject: [PATCH 18/21] Run black on dagster_snowflake

---
 .../libraries/dagster-snowflake/dagster_snowflake/resources.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/python_modules/libraries/dagster-snowflake/dagster_snowflake/resources.py b/python_modules/libraries/dagster-snowflake/dagster_snowflake/resources.py
index 914808296e176..80a1bbe3c47c8 100644
--- a/python_modules/libraries/dagster-snowflake/dagster_snowflake/resources.py
+++ b/python_modules/libraries/dagster-snowflake/dagster_snowflake/resources.py
@@ -20,8 +20,6 @@
     raise
 
 
-
-
 class SnowflakeConnection(object):
     def __init__(self, context):  # pylint: disable=too-many-locals
         # Extract parameters from resource config. Note that we can't pass None values to
From 3cda8b61f3813563cfbecaa38e5fe0e5094452fd Mon Sep 17 00:00:00 2001
From: Ben Sully
Date: Wed, 3 Jun 2020 08:55:19 +0100
Subject: [PATCH 19/21] Add pylint to dagster-azure tox.ini

---
 python_modules/libraries/dagster-azure/tox.ini | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/python_modules/libraries/dagster-azure/tox.ini b/python_modules/libraries/dagster-azure/tox.ini
index da4d1347591c1..ef532f45372f9 100644
--- a/python_modules/libraries/dagster-azure/tox.ini
+++ b/python_modules/libraries/dagster-azure/tox.ini
@@ -1,5 +1,5 @@
 [tox]
-envlist = py{37,36,35,27}-{unix,windows}
+envlist = py{37,36,35,27}-{unix,windows},pylint
 
 [testenv]
 passenv = CI_* COVERALLS_REPO_TOKEN AZURE_* BUILDKITE SSH_*
@@ -22,3 +22,11 @@ commands =
   coverage report --omit='.tox/*,**/test_*.py' --skip-covered
   coverage html --omit='.tox/*,**/test_*.py'
   coverage xml --omit='.tox/*,**/test_*.py'
+
+[testenv:pylint]
+whitelist_externals =
+  pylint
+basepython =
+  python3.7
+commands =
+  pylint -j 0 --rcfile=../../../.pylintrc dagster_azure dagster_azure_tests

From 3e8d8afcf3ff81d58fb6610b3021edf5f5872802 Mon Sep 17 00:00:00 2001
From: Ben Sully
Date: Wed, 3 Jun 2020 09:07:18 +0100
Subject: [PATCH 20/21] Explicitly specify __all__ for dagster_azure utils modules

---
 .../libraries/dagster-azure/dagster_azure/adls2/utils.py | 3 +++
 .../libraries/dagster-azure/dagster_azure/blob/utils.py  | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/python_modules/libraries/dagster-azure/dagster_azure/adls2/utils.py b/python_modules/libraries/dagster-azure/dagster_azure/adls2/utils.py
index 94278d8dc8dd0..aee8e17bf8450 100644
--- a/python_modules/libraries/dagster-azure/dagster_azure/adls2/utils.py
+++ b/python_modules/libraries/dagster-azure/dagster_azure/adls2/utils.py
@@ -26,3 +26,6 @@ def create_adls2_client(storage_account, credential):
     """
     account_url = _create_url(storage_account, "dfs")
     return DataLakeServiceClient(account_url, credential)
+
+
+__all__ = ['create_adls2_client', 'DataLakeServiceClient', 'ResourceNotFoundError']
diff --git a/python_modules/libraries/dagster-azure/dagster_azure/blob/utils.py b/python_modules/libraries/dagster-azure/dagster_azure/blob/utils.py
index 477c446348bbd..bb28ffce8d30e 100644
--- a/python_modules/libraries/dagster-azure/dagster_azure/blob/utils.py
+++ b/python_modules/libraries/dagster-azure/dagster_azure/blob/utils.py
@@ -28,3 +28,6 @@ def create_blob_client(storage_account, credential):
     if hasattr(credential, "account_key"):
         credential = credential.account_key
     return BlobServiceClient(account_url, credential)
+
+
+__all__ = ['create_blob_client', 'generate_blob_sas', 'BlobServiceClient', 'ResourceNotFoundError']
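Given `create_blob_client` and the `__all__` entries shown above, the blob helper could be exercised roughly as follows. This is a sketch only: the account name and key are placeholders, and the import path assumes the helper stays in `dagster_azure.blob.utils` as in the diff above.

```python
# Illustrative sketch only; 'mystorageaccount' and the key below are placeholders.
# Per the hasattr() check above, either a raw account key string or an object
# exposing `.account_key` can be passed as the credential.
from dagster_azure.blob.utils import create_blob_client

blob_service_client = create_blob_client('mystorageaccount', 'base64-account-key')
container_client = blob_service_client.get_container_client('my-container')
for blob in container_client.list_blobs():
    print(blob.name)
```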
From 96f08b10f3b6daa61d0a1f6ded0414766ec10781 Mon Sep 17 00:00:00 2001
From: Ben Sully
Date: Wed, 3 Jun 2020 16:19:23 +0100
Subject: [PATCH 21/21] Fix black issues

---
 .../libraries/dagster-azure/dagster_azure/blob/utils.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/python_modules/libraries/dagster-azure/dagster_azure/blob/utils.py b/python_modules/libraries/dagster-azure/dagster_azure/blob/utils.py
index ce9018e956ad3..02bfbb44cfe72 100644
--- a/python_modules/libraries/dagster-azure/dagster_azure/blob/utils.py
+++ b/python_modules/libraries/dagster-azure/dagster_azure/blob/utils.py
@@ -2,8 +2,11 @@
 
 try:
     # Centralise Azure imports here so we only need to warn in one place
-    from azure.core.exceptions import ResourceNotFoundError  # pylint: disable=unused-import
-    from azure.storage.blob import generate_blob_sas, BlobServiceClient  # pylint: disable=unused-import
+    from azure.core.exceptions import ResourceNotFoundError
+    from azure.storage.blob import (
+        generate_blob_sas,
+        BlobServiceClient,
+    )
 except ImportError:
     msg = (
         "Could not import required Azure objects. This probably means you have an old version "