diff --git a/CHANGELOG.md b/CHANGELOG.md
index 17daf8b5ce3d..1ffa5a216297 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -50,6 +50,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Explicit "Done" button when drawing any polyshapes ()
 - Histogram equalization with OpenCV javascript ()
 - Client-side polyshapes approximation when using semi-automatic interactors & scissors ()
+- Support of Google Cloud Storage for cloud storage ()
 
 ### Changed
 
diff --git a/cvat/apps/engine/cache.py b/cvat/apps/engine/cache.py
index 75196ee8403f..97dd420acfb0 100644
--- a/cvat/apps/engine/cache.py
+++ b/cvat/apps/engine/cache.py
@@ -87,7 +87,7 @@ def prepare_chunk_buff(self, db_data, quality, chunk_number):
                     name = f"{item['name']}{item['extension']}"
                     if name not in cloud_storage_instance:
                         raise Exception('{} file was not found on a {} storage'.format(name, cloud_storage_instance.name))
-                    with NamedTemporaryFile(mode='w+b', prefix='cvat', suffix=name, delete=False) as temp_file:
+                    with NamedTemporaryFile(mode='w+b', prefix='cvat', suffix=name.replace(os.path.sep, '#'), delete=False) as temp_file:
                         source_path = temp_file.name
                         buf = cloud_storage_instance.download_fileobj(name)
                         temp_file.write(buf.getvalue())
diff --git a/cvat/apps/engine/cloud_provider.py b/cvat/apps/engine/cloud_provider.py
index 017d5f7db9e0..31619bd13fee 100644
--- a/cvat/apps/engine/cloud_provider.py
+++ b/cvat/apps/engine/cloud_provider.py
@@ -1,6 +1,8 @@
 #from dataclasses import dataclass
 from abc import ABC, abstractmethod, abstractproperty
 from io import BytesIO
+import os
+import os.path
 
 import boto3
 from boto3.s3.transfer import TransferConfig
@@ -11,6 +13,8 @@
 from azure.core.exceptions import ResourceExistsError
 from azure.storage.blob import PublicAccess
 
+from google.cloud import storage
+
 from cvat.apps.engine.log import slogger
 from cvat.apps.engine.models import CredentialsTypeChoice, CloudProviderChoice
 
@@ -42,6 +46,7 @@ def download_fileobj(self, key):
     def download_file(self, key, path):
         file_obj = self.download_fileobj(key)
         if isinstance(file_obj, BytesIO):
+            os.makedirs(os.path.dirname(path), exist_ok=True)
             with open(path, 'wb') as f:
                 f.write(file_obj.getvalue())
         else:
@@ -77,6 +82,14 @@ def get_cloud_storage_instance(cloud_provider, resource, credentials, specific_a
             account_name=credentials.account_name,
             sas_token=credentials.session_token
         )
+    elif cloud_provider == CloudProviderChoice.GOOGLE_CLOUD_STORAGE:
+        instance = GoogleCloudStorage(
+            bucket_name=resource,
+            service_account_json=credentials.key_file_path,
+            prefix=specific_attributes.get('prefix'),
+            location=specific_attributes.get('location'),
+            project=specific_attributes.get('project')
+        )
     else:
         raise NotImplementedError()
     return instance
@@ -256,14 +269,89 @@ def download_fileobj(self, key):
 
 class GOOGLE_DRIVE(_CloudStorage):
     pass
 
+class GoogleCloudStorage(_CloudStorage):
+
+    def __init__(self, bucket_name, prefix=None, service_account_json=None, project=None, location=None):
+        super().__init__()
+        if service_account_json:
+            self._storage_client = storage.Client.from_service_account_json(service_account_json)
+        else:
+            self._storage_client = storage.Client()
+
+        bucket = self._storage_client.lookup_bucket(bucket_name)
+        if bucket is None:
+            bucket = self._storage_client.bucket(bucket_name, user_project=project)
+
+        self._bucket = bucket
+        self._bucket_location = location
+        self._prefix = prefix
+
+    @property
+    def bucket(self):
+        return self._bucket
+
+    @property
+    def name(self):
+        return self._bucket.name
+
+    def exists(self):
+        return self._storage_client.lookup_bucket(self.name) is not None
+
+    def initialize_content(self):
+        self._files = [
+            {
+                'name': blob.name
+            }
+            for blob in self._storage_client.list_blobs(
+                self.bucket, prefix=self._prefix
+            )
+        ]
+
+    def download_fileobj(self, key):
+        buf = BytesIO()
+        blob = self.bucket.blob(key)
+        self._storage_client.download_blob_to_file(blob, buf)
+        buf.seek(0)
+        return buf
+
+    def is_object_exist(self, key):
+        return self.bucket.blob(key).exists()
+
+    def upload_file(self, file_obj, file_name):
+        self.bucket.blob(file_name).upload_from_file(file_obj)
+
+    def create(self):
+        try:
+            self._bucket = self._storage_client.create_bucket(
+                self.bucket,
+                location=self._bucket_location
+            )
+            slogger.glob.info(
+                'Bucket {} has been created at {} region for {}'.format(
+                    self.name,
+                    self.bucket.location,
+                    self.bucket.user_project,
+                ))
+        except Exception as ex:
+            msg = str(ex)
+            slogger.glob.info(msg)
+            raise Exception(msg)
+
+    def get_file_last_modified(self, key):
+        blob = self.bucket.blob(key)
+        blob.reload()
+        return blob.updated
+
+
 class Credentials:
-    __slots__ = ('key', 'secret_key', 'session_token', 'account_name', 'credentials_type')
+    __slots__ = ('key', 'secret_key', 'session_token', 'account_name', 'key_file_path', 'credentials_type')
 
     def __init__(self, **credentials):
         self.key = credentials.get('key', '')
         self.secret_key = credentials.get('secret_key', '')
         self.session_token = credentials.get('session_token', '')
         self.account_name = credentials.get('account_name', '')
+        self.key_file_path = credentials.get('key_file_path', '')
         self.credentials_type = credentials.get('credentials_type', None)
 
     def convert_to_db(self):
@@ -271,6 +359,7 @@ def convert_to_db(self):
             CredentialsTypeChoice.TEMP_KEY_SECRET_KEY_TOKEN_SET : \
                 " ".join([self.key, self.secret_key, self.session_token]),
             CredentialsTypeChoice.ACCOUNT_NAME_TOKEN_PAIR : " ".join([self.account_name, self.session_token]),
+            CredentialsTypeChoice.KEY_FILE_PATH: self.key_file_path,
             CredentialsTypeChoice.ANONYMOUS_ACCESS: "",
         }
         return converted_credentials[self.credentials_type]
@@ -281,6 +370,8 @@ def convert_from_db(self, credentials):
             self.key, self.secret_key, self.session_token = credentials.get('value').split()
         elif self.credentials_type == CredentialsTypeChoice.ACCOUNT_NAME_TOKEN_PAIR:
             self.account_name, self.session_token = credentials.get('value').split()
+        elif self.credentials_type == CredentialsTypeChoice.KEY_FILE_PATH:
+            self.key_file_path = credentials.get('value')
         else:
             self.account_name, self.session_token, self.key, self.secret_key = ('', '', '', '')
             self.credentials_type = None
@@ -291,6 +382,7 @@ def mapping_with_new_values(self, credentials):
         self.secret_key = credentials.get('secret_key', self.secret_key)
         self.session_token = credentials.get('session_token', self.session_token)
         self.account_name = credentials.get('account_name', self.account_name)
+        self.key_file_path = credentials.get('key_file_path', self.key_file_path)
 
     def values(self):
-        return [self.key, self.secret_key, self.session_token, self.account_name]
+        return [self.key, self.secret_key, self.session_token, self.account_name, self.key_file_path]
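# --- Reviewer sketch (not part of the patch) -------------------------------
# A minimal, hedged example of driving the new GoogleCloudStorage class above
# directly; the bucket name, key file path, and object key are placeholders.
from cvat.apps.engine.cloud_provider import GoogleCloudStorage

gcs = GoogleCloudStorage(
    bucket_name='my-bucket',
    service_account_json='/path/to/service_account.json',
    prefix='frames/',
)
if gcs.exists():                      # bucket lookup via Client.lookup_bucket()
    gcs.initialize_content()          # fills self._files from Client.list_blobs()
    buf = gcs.download_fileobj('frames/000001.jpg')  # BytesIO, ready to read
# ---------------------------------------------------------------------------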
diff --git a/cvat/apps/engine/migrations/0041_auto_20210827_0258.py b/cvat/apps/engine/migrations/0041_auto_20210827_0258.py
new file mode 100644
index 000000000000..0e089c0e630a
--- /dev/null
+++ b/cvat/apps/engine/migrations/0041_auto_20210827_0258.py
@@ -0,0 +1,23 @@
+# Generated by Django 3.1.13 on 2021-08-27 02:58
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('engine', '0040_cloud_storage'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='cloudstorage',
+            name='credentials_type',
+            field=models.CharField(choices=[('TEMP_KEY_SECRET_KEY_TOKEN_SET', 'TEMP_KEY_SECRET_KEY_TOKEN_SET'), ('ACCOUNT_NAME_TOKEN_PAIR', 'ACCOUNT_NAME_TOKEN_PAIR'), ('KEY_FILE_PATH', 'KEY_FILE_PATH'), ('ANONYMOUS_ACCESS', 'ANONYMOUS_ACCESS')], max_length=29),
+        ),
+        migrations.AlterField(
+            model_name='cloudstorage',
+            name='provider_type',
+            field=models.CharField(choices=[('AWS_S3_BUCKET', 'AWS_S3'), ('AZURE_CONTAINER', 'AZURE_CONTAINER'), ('GOOGLE_DRIVE', 'GOOGLE_DRIVE'), ('GOOGLE_CLOUD_STORAGE', 'GOOGLE_CLOUD_STORAGE')], max_length=20),
+        ),
+    ]
diff --git a/cvat/apps/engine/models.py b/cvat/apps/engine/models.py
index 78ec751abd25..73c0da4979fa 100644
--- a/cvat/apps/engine/models.py
+++ b/cvat/apps/engine/models.py
@@ -542,6 +542,7 @@ class CloudProviderChoice(str, Enum):
     AWS_S3 = 'AWS_S3_BUCKET'
     AZURE_CONTAINER = 'AZURE_CONTAINER'
     GOOGLE_DRIVE = 'GOOGLE_DRIVE'
+    GOOGLE_CLOUD_STORAGE = 'GOOGLE_CLOUD_STORAGE'
 
     @classmethod
     def choices(cls):
@@ -558,6 +559,7 @@ class CredentialsTypeChoice(str, Enum):
     # ignore bandit issues because false positives
     TEMP_KEY_SECRET_KEY_TOKEN_SET = 'TEMP_KEY_SECRET_KEY_TOKEN_SET' # nosec
     ACCOUNT_NAME_TOKEN_PAIR = 'ACCOUNT_NAME_TOKEN_PAIR' # nosec
+    KEY_FILE_PATH = 'KEY_FILE_PATH'
    ANONYMOUS_ACCESS = 'ANONYMOUS_ACCESS'
 
     @classmethod
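# --- Reviewer sketch (not part of the patch) -------------------------------
# How the new KEY_FILE_PATH credentials type round-trips through the DB string
# format defined by Credentials.convert_to_db()/convert_from_db() above; the
# key file path is a placeholder.
from cvat.apps.engine.cloud_provider import Credentials
from cvat.apps.engine.models import CredentialsTypeChoice

creds = Credentials(
    key_file_path='/home/django/keys/service_account.json',
    credentials_type=CredentialsTypeChoice.KEY_FILE_PATH,
)
db_value = creds.convert_to_db()  # for KEY_FILE_PATH this is just the path

restored = Credentials(credentials_type=CredentialsTypeChoice.KEY_FILE_PATH)
restored.convert_from_db({'type': CredentialsTypeChoice.KEY_FILE_PATH, 'value': db_value})
assert restored.key_file_path == '/home/django/keys/service_account.json'
# ---------------------------------------------------------------------------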
diff --git a/cvat/apps/engine/serializers.py b/cvat/apps/engine/serializers.py
index f50e799bf427..784c71b88ec5 100644
--- a/cvat/apps/engine/serializers.py
+++ b/cvat/apps/engine/serializers.py
@@ -792,6 +792,7 @@ class CloudStorageSerializer(serializers.ModelSerializer):
     session_token = serializers.CharField(max_length=440, allow_blank=True, required=False)
     key = serializers.CharField(max_length=20, allow_blank=True, required=False)
     secret_key = serializers.CharField(max_length=40, allow_blank=True, required=False)
+    key_file_path = serializers.CharField(max_length=64, allow_blank=True, required=False)
     account_name = serializers.CharField(max_length=24, allow_blank=True, required=False)
 
     class Meta:
@@ -799,7 +800,7 @@ class Meta:
         fields = (
             'provider_type', 'resource', 'display_name', 'owner', 'credentials_type',
             'created_date', 'updated_date', 'session_token', 'account_name', 'key',
-            'secret_key', 'specific_attributes', 'description'
+            'secret_key', 'key_file_path', 'specific_attributes', 'description'
         )
         read_only_fields = ('created_date', 'updated_date', 'owner')
 
@@ -816,6 +817,9 @@ def validate(self, attrs):
         if attrs.get('provider_type') == models.CloudProviderChoice.AZURE_CONTAINER:
             if not attrs.get('account_name', ''):
                 raise serializers.ValidationError('Account name for Azure container was not specified')
+        if attrs.get('provider_type') == models.CloudProviderChoice.GOOGLE_CLOUD_STORAGE:
+            if not attrs.get('key_file_path', ''):
+                raise serializers.ValidationError('Key file path for Google cloud storage was not specified')
         return attrs
 
     def create(self, validated_data):
@@ -826,6 +830,7 @@ def create(self, validated_data):
             key=validated_data.pop('key', ''),
             secret_key=validated_data.pop('secret_key', ''),
             session_token=validated_data.pop('session_token', ''),
+            key_file_path=validated_data.pop('key_file_path', ''),
             credentials_type = validated_data.get('credentials_type')
         )
         if should_be_created:
@@ -859,7 +864,7 @@ def update(self, instance, validated_data):
             'type': instance.credentials_type,
             'value': instance.credentials,
         })
-        tmp = {k:v for k,v in validated_data.items() if k in {'key', 'secret_key', 'account_name', 'session_token', 'credentials_type'}}
+        tmp = {k:v for k,v in validated_data.items() if k in {'key', 'secret_key', 'account_name', 'session_token', 'key_file_path', 'credentials_type'}}
         credentials.mapping_with_new_values(tmp)
         instance.credentials = credentials.convert_to_db()
         instance.credentials_type = validated_data.get('credentials_type', instance.credentials_type)
diff --git a/cvat/apps/engine/views.py b/cvat/apps/engine/views.py
index afe9ef504fa8..9462f20b7b85 100644
--- a/cvat/apps/engine/views.py
+++ b/cvat/apps/engine/views.py
@@ -1262,7 +1262,8 @@ def perform_create(self, serializer):
             session_token=serializer.validated_data.get('session_token', ''),
             account_name=serializer.validated_data.get('account_name', ''),
             key=serializer.validated_data.get('key', ''),
-            secret_key=serializer.validated_data.get('secret_key', '')
+            secret_key=serializer.validated_data.get('secret_key', ''),
+            key_file_path=serializer.validated_data.get('key_file_path', '')
         )
         details = {
             'resource': serializer.validated_data.get('resource'),
diff --git a/cvat/requirements/base.txt b/cvat/requirements/base.txt
index 2ec265159666..4f8190ac4794 100644
--- a/cvat/requirements/base.txt
+++ b/cvat/requirements/base.txt
@@ -47,6 +47,7 @@ diskcache==5.0.2
 open3d==0.11.2
 boto3==1.17.61
 azure-storage-blob==12.8.1
+google-cloud-storage==1.42.0
 # --no-binary=datumaro: workaround for pip to install
 # opencv-headless instead of regular opencv, to actually run setup script
 # --no-binary=pycocotools: workaround for binary incompatibility on numpy 1.20
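# --- Reviewer sketch (not part of the patch) -------------------------------
# A hedged example of a request payload the updated CloudStorageSerializer
# would accept for the new provider. The endpoint path, host, and credentials
# are assumptions; the field names come from the serializer changes above.
import requests

data = {
    'provider_type': 'GOOGLE_CLOUD_STORAGE',
    'resource': 'my-bucket',          # bucket name
    'display_name': 'GCS storage',
    'credentials_type': 'KEY_FILE_PATH',
    'key_file_path': '/home/django/keys/service_account.json',
}
# validate() rejects GOOGLE_CLOUD_STORAGE payloads that omit key_file_path
response = requests.post('http://localhost:8080/api/v1/cloudstorages',
                         json=data, auth=('user', 'password'))
# ---------------------------------------------------------------------------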