diff --git a/.gitignore b/.gitignore
index 92094448373..dbb0fa606cb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -28,6 +28,7 @@ logs/*
 media/dash
 media/epub
 media/export
+media/html
 media/htmlzip
 media/json
 media/man
diff --git a/docs/settings.rst b/docs/settings.rst
index 195fddecc24..2b10ee4d6d1 100644
--- a/docs/settings.rst
+++ b/docs/settings.rst
@@ -94,6 +94,16 @@ Default: :djangosetting:`ALLOW_ADMIN`
 
 Whether to include `django.contrib.admin` in the URL's.
 
+RTD_BUILD_MEDIA_STORAGE
+-----------------------
+
+Default: ``None``
+
+Use this storage class to upload build artifacts to cloud storage (S3, Azure storage).
+This should be a dotted path to the relevant class (e.g. ``'path.to.MyBuildMediaStorage'``).
+This class should mix in :class:`readthedocs.builds.storage.BuildMediaStorageMixin`.
+
+
 ELASTICSEARCH_DSL
 -----------------
 
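To make the new setting concrete: a deployment-specific class would combine the BuildMediaStorageMixin introduced below in readthedocs/builds/storage.py with one of the backends from django-storages (added to requirements/pip.txt at the end of this diff). A minimal sketch, assuming the S3 backend and hypothetical module, class, and bucket names:

# readthedocsext/storage.py -- hypothetical module; all names here are illustrative
from storages.backends.s3boto3 import S3Boto3Storage

from readthedocs.builds.storage import BuildMediaStorageMixin


class S3BuildMediaStorage(BuildMediaStorageMixin, S3Boto3Storage):

    """Write build artifacts (HTML, JSON, PDF, ePub, zip) to an S3 bucket."""

    bucket_name = 'readthedocs-build-media'  # illustrative bucket name


# and in the Django settings:
# RTD_BUILD_MEDIA_STORAGE = 'readthedocsext.storage.S3BuildMediaStorage'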
diff --git a/readthedocs/builds/storage.py b/readthedocs/builds/storage.py
new file mode 100644
index 00000000000..f79406428a9
--- /dev/null
+++ b/readthedocs/builds/storage.py
@@ -0,0 +1,105 @@
+import logging
+from pathlib import Path
+
+from django.core.files.storage import FileSystemStorage
+from storages.utils import safe_join, get_available_overwrite_name
+
+
+log = logging.getLogger(__name__)
+
+
+class BuildMediaStorageMixin:
+
+    """
+    A mixin for Storage classes needed to write build artifacts
+
+    This adds and modifies some functionality to Django's File Storage API.
+    Classes mixing this in will now overwrite files by default instead
+    of finding an available name.
+    This mixin also adds convenience methods to copy and delete entire directories.
+
+    See: https://docs.djangoproject.com/en/1.11/ref/files/storage
+    """
+
+    @staticmethod
+    def _dirpath(path):
+        """It may just be Azure, but for listdir to work correctly, the path must end in '/'"""
+        path = str(path)
+        if not path.endswith('/'):
+            path += '/'
+
+        return path
+
+    def get_available_name(self, name, max_length=None):
+        """
+        Overrides Django's storage implementation to always return the passed name (overwrite)
+
+        By default, Django will not overwrite files even if the same name is specified.
+        This changes that functionality so that the default is to use the same name and overwrite
+        rather than modify the path to not clobber files.
+        """
+        return get_available_overwrite_name(name, max_length=max_length)
+
+    def delete_directory(self, path):
+        """
+        Delete all files under a certain path from storage
+
+        Many storage backends (S3, Azure storage) don't care about "directories".
+        The directory effectively doesn't exist if there are no files in it.
+        However, in these backends, there is no "rmdir" operation so you have to recursively
+        delete all files.
+
+        :param path: the path to the directory to remove
+        """
+        log.debug('Deleting directory %s from media storage', path)
+        folders, files = self.listdir(self._dirpath(path))
+        for folder_name in folders:
+            if folder_name:
+                # Recursively delete the subdirectory
+                self.delete_directory(safe_join(path, folder_name))
+        for filename in files:
+            if filename:
+                self.delete(safe_join(path, filename))
+
+    def copy_directory(self, source, destination):
+        """
+        Copy a directory recursively to storage
+
+        :param source: the source path on the local disk
+        :param destination: the destination path in storage
+        """
+        log.debug('Copying source directory %s to media storage at %s', source, destination)
+        source = Path(source)
+        for filepath in source.iterdir():
+            sub_destination = safe_join(destination, filepath.name)
+            if filepath.is_dir():
+                # Recursively copy the subdirectory
+                self.copy_directory(filepath, sub_destination)
+            elif filepath.is_file():
+                with filepath.open('rb') as fd:
+                    self.save(sub_destination, fd)
+
+
+class BuildMediaFileSystemStorage(BuildMediaStorageMixin, FileSystemStorage):
+
+    """Storage subclass that writes build artifacts under MEDIA_ROOT"""
+
+    def get_available_name(self, name, max_length=None):
+        """
+        A hack to overwrite files by default with FileSystemStorage
+
+        After upgrading to Django 2.2, this method can be removed
+        because subclasses can set OS_OPEN_FLAGS such that FileSystemStorage._save
+        will properly overwrite files.
+        See: https://github.com/django/django/pull/8476
+        """
+        name = super().get_available_name(name, max_length=max_length)
+        if self.exists(name):
+            self.delete(name)
+        return name
+
+    def listdir(self, path):
+        """Return empty lists for nonexistent directories (as cloud storages do)"""
+        if not self.exists(path):
+            return [], []
+        return super().listdir(path)
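The two directory helpers are the main additions over Django's Storage API. A quick sketch of how they behave with the bundled filesystem backend (run from a Django shell for this project; the paths are illustrative):

from readthedocs.builds.storage import BuildMediaFileSystemStorage

storage = BuildMediaFileSystemStorage()  # writes under MEDIA_ROOT

# Recursively upload a local build output directory to the 'pdf/myproject/latest'
# prefix, overwriting any files that already exist at those names.
storage.copy_directory('/tmp/example-build/pdf', 'pdf/myproject/latest')

# Recursively delete everything under that prefix; there is no "rmdir", so the
# helper walks listdir() and deletes the files one by one.
storage.delete_directory('pdf/myproject/latest')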
diff --git a/readthedocs/builds/syncers.py b/readthedocs/builds/syncers.py
index 494ea23644b..bee327e88e0 100644
--- a/readthedocs/builds/syncers.py
+++ b/readthedocs/builds/syncers.py
@@ -10,7 +10,6 @@
 import shutil
 
 from django.conf import settings
-from django.core.exceptions import SuspiciousFileOperation
 from django.core.files.storage import get_storage_class
 
 from readthedocs.core.utils import safe_makedirs
@@ -30,6 +29,15 @@ def copy(cls, path, target, is_file=False, **kwargs):
         raise NotImplementedError
 
 
+class NullSyncer:
+
+    """A syncer that doesn't actually do anything"""
+
+    @classmethod
+    def copy(cls, path, target, is_file=False, **kwargs):
+        pass  # noqa
+
+
 class LocalSyncer(BaseSyncer):
 
     @classmethod
@@ -167,62 +175,6 @@ def copy(cls, path, target, host, is_file=False, **kwargs):  # pylint: disable=a
         )
 
 
-class SelectiveStorageRemotePuller(RemotePuller):
-
-    """
-    Like RemotePuller but certain files are copied via Django's storage system.
-
-    If a file with extensions specified by ``extensions`` is copied, it will be copied to storage
-    and the original is removed.
-
-    See: https://docs.djangoproject.com/en/1.11/ref/settings/#std:setting-DEFAULT_FILE_STORAGE
-    """
-
-    extensions = ('.pdf', '.epub', '.zip')
-
-    @classmethod
-    def get_storage_path(cls, path):
-        """
-        Gets the path to the file within the storage engine.
-
-        For example, if the path was $MEDIA_ROOT/pdfs/latest.pdf
-        the storage_path is 'pdfs/latest.pdf'
-
-        :raises: SuspiciousFileOperation if the path isn't under settings.MEDIA_ROOT
-        """
-        path = os.path.normpath(path)
-        if not path.startswith(settings.MEDIA_ROOT):
-            raise SuspiciousFileOperation
-
-        path = path.replace(settings.MEDIA_ROOT, '').lstrip('/')
-        return path
-
-    @classmethod
-    def copy(cls, path, target, host, is_file=False, **kwargs):  # pylint: disable=arguments-differ
-        RemotePuller.copy(path, target, host, is_file, **kwargs)
-
-        if getattr(storage, 'write_build_media', False):
-            # This is a sanity check for the case where
-            # storage is backed by the local filesystem
-            # In that case, removing the original target file locally
-            # would remove the file from storage as well
-
-            if is_file and os.path.exists(target) and \
-                    any([target.lower().endswith(ext) for ext in cls.extensions]):
-                log.info('Selective Copy %s to media storage', target)
-
-                try:
-                    storage_path = cls.get_storage_path(target)
-
-                    if storage.exists(storage_path):
-                        storage.delete(storage_path)
-
-                    with open(target, 'rb') as fd:
-                        storage.save(storage_path, fd)
-                except Exception:
-                    log.exception('Storage access failed for file. Not failing build.')
-
-
 class Syncer(SettingsOverrideObject):
     _default_class = LocalSyncer
     _override_setting = 'FILE_SYNCER'
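NullSyncer presumably gives deployments that serve everything from media storage a way to skip host-to-host syncing entirely; it would be selected through the existing FILE_SYNCER override shown above. A sketch, assuming such a deployment:

# Django settings (illustrative): Syncer is a SettingsOverrideObject, so a dotted
# path here swaps the default LocalSyncer for the no-op NullSyncer.
FILE_SYNCER = 'readthedocs.builds.syncers.NullSyncer'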
diff --git a/readthedocs/projects/models.py b/readthedocs/projects/models.py
index 4626c035b24..8645aae315e 100644
--- a/readthedocs/projects/models.py
+++ b/readthedocs/projects/models.py
@@ -504,23 +504,29 @@ def get_subproject_urls(self):
         return [(proj.child.slug, proj.child.get_docs_url())
                 for proj in self.subprojects.all()]
 
-    def get_storage_path(self, type_, version_slug=LATEST):
+    def get_storage_path(self, type_, version_slug=LATEST, include_file=True):
         """
         Get a path to a build artifact for use with Django's storage system.
 
         :param type_: Media content type, ie - 'pdf', 'htmlzip'
         :param version_slug: Project version slug for lookup
+        :param include_file: Include file name in return
         :return: the path to an item in storage
             (can be used with ``storage.url`` to get the URL)
         """
-        extension = type_.replace('htmlzip', 'zip')
-        return '{}/{}/{}/{}.{}'.format(
+        folder_path = '{}/{}/{}'.format(
             type_,
             self.slug,
             version_slug,
-            self.slug,
-            extension,
         )
+        if include_file:
+            extension = type_.replace('htmlzip', 'zip')
+            return '{}/{}.{}'.format(
+                folder_path,
+                self.slug,
+                extension,
+            )
+        return folder_path
 
     def get_production_media_path(self, type_, version_slug, include_file=True):
         """
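For reference, the paths the reworked method produces, with and without the trailing file name (the slug values are illustrative; runnable from a Django shell for this project):

from readthedocs.projects.models import Project

project = Project(slug='myproject')  # unsaved instance; only the slug is used

project.get_storage_path('pdf', version_slug='latest')
# 'pdf/myproject/latest/myproject.pdf'

project.get_storage_path('htmlzip', version_slug='latest')
# 'htmlzip/myproject/latest/myproject.zip'  ('htmlzip' maps to a .zip file)

project.get_storage_path('pdf', version_slug='latest', include_file=False)
# 'pdf/myproject/latest'  (the directory prefix used when copying whole trees)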
diff --git a/readthedocs/projects/tasks.py b/readthedocs/projects/tasks.py
index e0e1c219502..4291f1dbc05 100644
--- a/readthedocs/projects/tasks.py
+++ b/readthedocs/projects/tasks.py
@@ -81,7 +81,6 @@
 
 log = logging.getLogger(__name__)
 
-storage = get_storage_class()()
 
 
 class SyncRepositoryMixin:
@@ -571,18 +570,28 @@ def run_build(self, docker, record):
                 raise VersionLockedError
             except SoftTimeLimitExceeded:
                 raise BuildTimeoutError
-
-            # Finalize build and update web servers
-            if self.build.get('id'):
-                self.update_app_instances(
-                    html=bool(outcomes['html']),
-                    search=bool(outcomes['search']),
-                    localmedia=bool(outcomes['localmedia']),
-                    pdf=bool(outcomes['pdf']),
-                    epub=bool(outcomes['epub']),
-                )
             else:
-                log.warning('No build ID, not syncing files')
+                build_id = self.build.get('id')
+                if build_id:
+                    # Store build artifacts to storage (local or cloud storage)
+                    self.store_build_artifacts(
+                        html=bool(outcomes['html']),
+                        search=bool(outcomes['search']),
+                        localmedia=bool(outcomes['localmedia']),
+                        pdf=bool(outcomes['pdf']),
+                        epub=bool(outcomes['epub']),
+                    )
+
+                    # Finalize build and update web servers
+                    self.update_app_instances(
+                        html=bool(outcomes['html']),
+                        search=bool(outcomes['search']),
+                        localmedia=bool(outcomes['localmedia']),
+                        pdf=bool(outcomes['pdf']),
+                        epub=bool(outcomes['epub']),
+                    )
+                else:
+                    log.warning('No build ID, not syncing files')
 
         if self.build_env.failed:
             self.send_notifications(self.version.pk, self.build['id'])
@@ -717,6 +726,128 @@ def save_build_config(self):
         })
         self.build['config'] = config
 
+    def store_build_artifacts(
+            self,
+            html=False,
+            localmedia=False,
+            search=False,
+            pdf=False,
+            epub=False,
+    ):
+        """
+        Save build artifacts to "storage" using Django's storage API
+
+        The storage could be local filesystem storage OR cloud blob storage
+        such as S3, Azure storage or Google Cloud Storage.
+
+        Remove build artifacts of types not included in this build (PDF, ePub, zip only).
+
+        This looks very similar to `move_files` and is intended to replace it!
+
+        :param html: whether to save HTML output
+        :param localmedia: whether to save localmedia (htmlzip) output
+        :param search: whether to save search artifacts
+        :param pdf: whether to save PDF output
+        :param epub: whether to save ePub output
+        """
+        if settings.RTD_BUILD_MEDIA_STORAGE:
+            log.info(
+                LOG_TEMPLATE,
+                {
+                    'project': self.version.project.slug,
+                    'version': self.version.slug,
+                    'msg': 'Writing build artifacts to media storage',
+                },
+            )
+
+            storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)()
+
+            types_to_copy = []
+            types_to_delete = []
+
+            # HTML media
+            if html:
+                types_to_copy.append(('html', self.config.doctype))
+
+            # Search media (JSON)
+            if search:
+                types_to_copy.append(('json', 'sphinx_search'))
+
+            if localmedia:
+                types_to_copy.append(('htmlzip', 'sphinx_localmedia'))
+            else:
+                types_to_delete.append('htmlzip')
+
+            if pdf:
+                types_to_copy.append(('pdf', 'sphinx_pdf'))
+            else:
+                types_to_delete.append('pdf')
+
+            if epub:
+                types_to_copy.append(('epub', 'sphinx_epub'))
+            else:
+                types_to_delete.append('epub')
+
+            for media_type, build_type in types_to_copy:
+                from_path = self.version.project.artifact_path(
+                    version=self.version.slug,
+                    type_=build_type,
+                )
+                to_path = self.version.project.get_storage_path(
+                    type_=media_type,
+                    version_slug=self.version.slug,
+                    include_file=False,
+                )
+                log.info(
+                    LOG_TEMPLATE,
+                    {
+                        'project': self.version.project.slug,
+                        'version': self.version.slug,
+                        'msg': f'Writing {media_type} to media storage - {to_path}',
+                    },
+                )
+                try:
+                    storage.copy_directory(from_path, to_path)
+                except Exception:
+                    # Ideally this should just be an IOError
+                    # but some storage backends unfortunately throw other errors
+                    log.exception(
+                        LOG_TEMPLATE,
+                        {
+                            'project': self.version.project.slug,
+                            'version': self.version.slug,
+                            'msg': f'Error copying {from_path} to storage (not failing build)',
+                        },
+                    )
+
+            for media_type in types_to_delete:
+                media_path = self.version.project.get_storage_path(
+                    type_=media_type,
+                    version_slug=self.version.slug,
+                    include_file=False,
+                )
+                log.info(
+                    LOG_TEMPLATE,
+                    {
+                        'project': self.version.project.slug,
+                        'version': self.version.slug,
+                        'msg': f'Deleting {media_type} from media storage - {media_path}',
+                    },
+                )
+                try:
+                    storage.delete_directory(media_path)
+                except Exception:
+                    # Ideally this should just be an IOError
+                    # but some storage backends unfortunately throw other errors
+                    log.exception(
+                        LOG_TEMPLATE,
+                        {
+                            'project': self.version.project.slug,
+                            'version': self.version.slug,
+                            'msg': f'Error deleting {media_path} from storage (not failing build)',
+                        },
+                    )
+
     def update_app_instances(
             self,
             html=False,
@@ -747,25 +878,6 @@ def update_app_instances(
 
         delete_unsynced_media = True
 
-        if getattr(storage, 'write_build_media', False):
-            # Handle the case where we want to upload some built assets to our storage
-            move_files.delay(
-                self.version.pk,
-                hostname,
-                self.config.doctype,
-                html=False,
-                search=False,
-                localmedia=localmedia,
-                pdf=pdf,
-                epub=epub,
-                delete_unsynced_media=delete_unsynced_media,
-            )
-            # Set variables so they don't get synced in the next broadcast step
-            localmedia = False
-            pdf = False
-            epub = False
-            delete_unsynced_media = False
-
         # Broadcast finalization steps to web application instances
         broadcast(
             type='app',
@@ -1026,16 +1138,6 @@ def move_files(
                 ),
             ])
 
-            if getattr(storage, 'write_build_media', False):
-                # Remove the media from remote storage if it exists
-                storage_path = version.project.get_storage_path(
-                    type_=media_type,
-                    version_slug=version.slug,
-                )
-                if storage.exists(storage_path):
-                    log.info('Removing %s from media storage', storage_path)
-                    storage.delete(storage_path)
-
     log.debug(
         LOG_TEMPLATE,
         {
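Once store_build_artifacts has uploaded a directory, the same storage class can be used elsewhere to check for an artifact and build a link to it, as the get_storage_path docstring suggests. A sketch, assuming RTD_BUILD_MEDIA_STORAGE is configured and using an illustrative slug:

from django.conf import settings
from django.core.files.storage import get_storage_class

from readthedocs.projects.models import Project

storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)()
project = Project(slug='myproject')

pdf_path = project.get_storage_path('pdf', version_slug='latest')
if storage.exists(pdf_path):
    # A MEDIA_URL-based URL for the filesystem backend, or the blob/bucket URL
    # for a cloud backend.
    print(storage.url(pdf_path))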
diff --git a/readthedocs/settings/base.py b/readthedocs/settings/base.py
index ae63ac9fe51..0233c97d1ee 100644
--- a/readthedocs/settings/base.py
+++ b/readthedocs/settings/base.py
@@ -208,6 +208,10 @@ def USE_PROMOS(self):  # noqa
     ]
     PYTHON_MEDIA = False
 
+    # Optional Django Storage subclass used to write build artifacts to cloud or local storage
+    # https://docs.readthedocs.io/en/stable/settings.html#build-media-storage
+    RTD_BUILD_MEDIA_STORAGE = None
+
     TEMPLATES = [
         {
             'BACKEND': 'django.template.backends.django.DjangoTemplates',
diff --git a/requirements/pip.txt b/requirements/pip.txt
index 8ecd7b1e428..51b75bab1cd 100644
--- a/requirements/pip.txt
+++ b/requirements/pip.txt
@@ -98,5 +98,8 @@ django-cors-middleware==1.3.1
 # User agent parsing - used for analytics purposes
 user-agents<1.2.0
 
+# Utilities used to upload build media to cloud storage
+django-storages==1.7.1  # pyup: <1.8
+
 # Required only in development and linting
 django-debug-toolbar==1.11
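Because RTD_BUILD_MEDIA_STORAGE defaults to None, none of the new upload code runs until a deployment opts in. A sketch of a local override that exercises the new code path end to end with the bundled filesystem backend (where the override lives is up to the deployment):

# Local settings override (illustrative)
RTD_BUILD_MEDIA_STORAGE = 'readthedocs.builds.storage.BuildMediaFileSystemStorage'

# Build artifacts then land under MEDIA_ROOT (media/html, media/pdf, ...),
# which is why media/html is added to .gitignore at the top of this diff.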