feat: add support for static assets

This alters the data model in the components app in a number of ways in order to support static assets, along with some refactoring: * Content has been renamed to RawContent to make it more clear when we are talking about "content" as a general concept and the actual data model that holds the raw bytes. * RawContent now uses FileField instead of BinaryField, giving us more cheaply scalable storage in exchange for higher latency. This is offset by the new TextContent model that will be used to store the text versions of RawContent that needs low-latency access (like XBlock OLX). * A primitive media_server app now exists to view static assets during development. It is NOT safe to use on a running site yet.
openedx · Apr 14, 2023 · d18dc2d · d18dc2d
1 parent 867ea8d
commit d18dc2d
Show file tree

Hide file tree

Showing 30 changed files with 1,016 additions and 558 deletions.
diff --git a/.gitignore b/.gitignore
@@ -65,3 +65,5 @@ dev.db
 
 .vscode
 
+# Media files (for uploads)
+media/
diff --git a/olx_importer/apps.py b/olx_importer/apps.py
@@ -9,5 +9,6 @@ class OLXImporterConfig(AppConfig):
     """
     Configuration for the OLX Importer Django application.
     """
+
     name = "olx_importer"
     verbose_name = "OLX Importer"
diff --git a/olx_importer/management/commands/load_components.py b/olx_importer/management/commands/load_components.py
@@ -11,9 +11,9 @@
 
 Open Question: If the data model is extensible, how do we know whether a change
 has really happened between what's currently stored/published for a particular
-item and the new value we want to set? For Content that's easy, because we have
-actual hashes of the data. But it's not clear how that would work for something
-like an ComponentVersion. We'd have to have some kind of mechanism where every 
+item and the new value we want to set? For RawContent that's easy, because we
+have actual hashes of the data. But it's not clear how that would work for
+something like an ComponentVersion. We'd have to have some kind of mechanism where every 
 pp that wants to attach data gets to answer the question of "has anything
 changed?" in order to decide if we really make a new ComponentVersion or not.
 """
@@ -25,22 +25,28 @@
 import re
 import xml.etree.ElementTree as ET
 
+from django.core.files.base import ContentFile
 from django.core.management.base import BaseCommand
 from django.db import transaction
 
 from openedx_learning.core.publishing.models import LearningPackage, PublishLogEntry
 from openedx_learning.core.components.models import (
-    Content, Component, ComponentVersion, ComponentVersionContent,
-    ComponentPublishLogEntry, PublishedComponent,
+    Component,
+    ComponentVersion,
+    ComponentVersionRawContent,
+    ComponentPublishLogEntry,
+    PublishedComponent,
+    RawContent,
+    TextContent,
 )
 from openedx_learning.lib.fields import create_hash_digest
 
-SUPPORTED_TYPES = ['problem', 'video', 'html']
+SUPPORTED_TYPES = ["problem", "video", "html"]
 logger = logging.getLogger(__name__)
 
 
 class Command(BaseCommand):
-    help = 'Load sample Component data from course export'
+    help = "Load sample Component data from course export"
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -61,20 +67,19 @@ def init_known_types(self):
         # officially "text/javascript"
         mimetypes.add_type("text/javascript", ".js")
         mimetypes.add_type("text/javascript", ".mjs")
-
 
     def add_arguments(self, parser):
-        parser.add_argument('course_data_path', type=pathlib.Path)
-        parser.add_argument('learning_package_identifier', type=str)
+        parser.add_argument("course_data_path", type=pathlib.Path)
+        parser.add_argument("learning_package_identifier", type=str)
 
     def handle(self, course_data_path, learning_package_identifier, **options):
         self.course_data_path = course_data_path
         self.learning_package_identifier = learning_package_identifier
         self.load_course_data(learning_package_identifier)
 
     def get_course_title(self):
-        course_type_dir = self.course_data_path / 'course'
-        course_xml_file = next(course_type_dir.glob('*.xml'))
+        course_type_dir = self.course_data_path / "course"
+        course_xml_file = next(course_type_dir.glob("*.xml"))
         course_root = ET.parse(course_xml_file).getroot()
         return course_root.attrib.get("display_name", "Unknown Course")
 
@@ -87,9 +92,9 @@ def load_course_data(self, learning_package_identifier):
             learning_package, _created = LearningPackage.objects.get_or_create(
                 identifier=learning_package_identifier,
                 defaults={
-                    'title': title,
-                    'created': now,
-                    'updated': now,
+                    "title": title,
+                    "created": now,
+                    "updated": now,
                 },
             )
             self.learning_package = learning_package
@@ -105,11 +110,13 @@ def load_course_data(self, learning_package_identifier):
                 self.import_block_type(block_type, now, publish_log_entry)
 
     def create_content(self, static_local_path, now, component_version):
-        identifier = pathlib.Path('static') / static_local_path
+        identifier = pathlib.Path("static") / static_local_path
         real_path = self.course_data_path / identifier
         mime_type, _encoding = mimetypes.guess_type(identifier)
         if mime_type is None:
-            logger.error(f"  no mimetype found for {real_path}, defaulting to application/binary")
+            logger.error(
+                f"  no mimetype found for {real_path}, defaulting to application/binary"
+            )
             mime_type = "application/binary"
 
         try:
@@ -120,20 +127,26 @@ def create_content(self, static_local_path, now, component_version):
 
         hash_digest = create_hash_digest(data_bytes)
 
-        content, _created = Content.objects.get_or_create(
+        raw_content, created = RawContent.objects.get_or_create(
             learning_package=self.learning_package,
             mime_type=mime_type,
             hash_digest=hash_digest,
-            defaults = dict(
-                data=data_bytes,
+            defaults=dict(
                 size=len(data_bytes),
                 created=now,
-            )
+            ),
         )
-        ComponentVersionContent.objects.get_or_create(
+        if created:
+            raw_content.file.save(
+                f"{raw_content.learning_package.uuid}/{hash_digest}",
+                ContentFile(data_bytes),
+            )
+
+        ComponentVersionRawContent.objects.get_or_create(
             component_version=component_version,
-            content=content,
+            raw_content=raw_content,
             identifier=identifier,
+            learner_downloadable=True,
         )
 
     def import_block_type(self, block_type, now, publish_log_entry):
@@ -146,35 +159,49 @@ def import_block_type(self, block_type, now, publish_log_entry):
         static_files_regex = re.compile(r"""['"]\/static\/(.+?)["'\?]""")
         block_data_path = self.course_data_path / block_type
 
-        for xml_file_path in block_data_path.glob('*.xml'):
+        for xml_file_path in block_data_path.glob("*.xml"):
             components_found += 1
             identifier = xml_file_path.stem
 
             # Find or create the Component itself
             component, _created = Component.objects.get_or_create(
                 learning_package=self.learning_package,
-                namespace='xblock.v1',
+                namespace="xblock.v1",
                 type=block_type,
                 identifier=identifier,
-                defaults = {
-                    'created': now,
-                }
+                defaults={
+                    "created": now,
+                },
             )
 
-            # Create the Content entry for the raw data...
+            # Create the RawContent entry for the raw data...
             data_bytes = xml_file_path.read_bytes()
             hash_digest = create_hash_digest(data_bytes)
-            data_str = codecs.decode(data_bytes, 'utf-8')
-            content, _created = Content.objects.get_or_create(
+
+            raw_content, created = RawContent.objects.get_or_create(
                 learning_package=self.learning_package,
-                mime_type=f'application/vnd.openedx.xblock.v1.{block_type}+xml',
+                mime_type=f"application/vnd.openedx.xblock.v1.{block_type}+xml",
                 hash_digest=hash_digest,
-                defaults = dict(
-                    data=data_bytes,
+                defaults=dict(
+                    # text=data_str,
                     size=len(data_bytes),
                     created=now,
-                )
+                ),
             )
+            if created:
+                raw_content.file.save(
+                    f"{raw_content.learning_package.uuid}/{hash_digest}",
+                    ContentFile(data_bytes),
+                )
+
+                # Decode as utf-8-sig in order to strip any BOM from the data.
+                data_str = codecs.decode(data_bytes, "utf-8-sig")
+                TextContent.objects.create(
+                    raw_content=raw_content,
+                    text=data_str,
+                    length=len(data_str),
+                )
+
             # TODO: Get associated file contents, both with the static regex, as
             # well as with XBlock-specific code that examines attributes in
             # video and HTML tag definitions.
@@ -185,7 +212,7 @@ def import_block_type(self, block_type, now, publish_log_entry):
                 logger.error(f"Parse error for {xml_file_path}: {err}")
                 continue
 
-            display_name = block_root.attrib.get('display_name', "")
+            display_name = block_root.attrib.get("display_name", "")
 
             # Create the ComponentVersion
             component_version = ComponentVersion.objects.create(
@@ -195,10 +222,11 @@ def import_block_type(self, block_type, now, publish_log_entry):
                 created=now,
                 created_by=None,
             )
-            ComponentVersionContent.objects.create(
+            ComponentVersionRawContent.objects.create(
                 component_version=component_version,
-                content=content,
-                identifier='source.xml',
+                raw_content=raw_content,
+                identifier="source.xml",
+                learner_downloadable=False,
             )
             static_files_found = static_files_regex.findall(data_str)
             for static_local_path in static_files_found:

diff --git a/openedx_learning/contrib/media_server/__init__.py b/openedx_learning/contrib/media_server/__init__.py
diff --git a/openedx_learning/contrib/media_server/apps.py b/openedx_learning/contrib/media_server/apps.py
@@ -0,0 +1,22 @@
+from django.apps import AppConfig
+from django.core.exceptions import ImproperlyConfigured
+from django.conf import settings
+
+
+class MediaServerConfig(AppConfig):
+    """
+    Configuration for the Media Server application.
+    """
+
+    name = "openedx_learning.contrib.media_server"
+    verbose_name = "Learning Core: Media Server"
+    default_auto_field = "django.db.models.BigAutoField"
+
+    def ready(self):
+        if not settings.DEBUG:
+            # Until we get proper security and support for running this app
+            # under a separate domain, just don't allow it to be run in
+            # production environments.
+            raise ImproperlyConfigured(
+                "The media_server app should only be run in DEBUG mode!"
+            )
diff --git a/openedx_learning/contrib/media_server/readme.rst b/openedx_learning/contrib/media_server/readme.rst
@@ -0,0 +1,16 @@
+Media Server App
+================
+
+The ``media_server`` app exists to serve media files that are ultimately backed by the RawContent model, *for development purposes and for sites with light-to-moderate traffic*. It also provides an API that can be used by CDNs for high traffic sites.
+
+Motivation
+----------
+
+The ``components`` app stores large binary file data by calculating the hash and creating a django-storages backed file named after that hash. This is efficient from a storage point of view, because we don't store redundant copies for every version of a Component. There are at least two drawbacks:
+
+* We have unintelligibly named files that are confusing for clients.
+* Intra-file links between media files break. For instance, if we have a piece of HTML that makes a reference to a VTT file, that filename will have changed.
+
+This app tries to bridge that gap by serving URLs that preserve the original file names and give the illusion that there is a seprate set of media files for every version of a Component, but does a lookup behind the scenes to serve the correct hash-based-file.
+
+The big caveat on this is that Django is not really optimized to do this sort of asset serving. The most scalable approach is to have a CDN-backed solution where ``media_server`` serves the locations of files that are converted by worker code to serving the actual assets. (More details to follow when that part gets built out.)
diff --git a/openedx_learning/contrib/media_server/urls.py b/openedx_learning/contrib/media_server/urls.py
@@ -0,0 +1,16 @@
+from django.urls import path
+
+from .views import component_asset
+
+urlpatterns = [
+    path(
+        (
+            "component_asset/"
+            "<str:learning_package_identifier>/"
+            "<str:component_identifier>/"
+            "<int:version_num>/"
+            "<path:asset_path>"
+        ),
+        component_asset,
+    )
+]
diff --git a/openedx_learning/contrib/media_server/views.py b/openedx_learning/contrib/media_server/views.py
@@ -0,0 +1,27 @@
+"""
+Early views 
+"""
+from django.http import FileResponse
+from django.http import Http404
+from django.core.exceptions import ObjectDoesNotExist, PermissionDenied
+
+from openedx_learning.core.components.api import get_component_version_content
+
+
+def component_asset(
+    request, learning_package_identifier, component_identifier, version_num, asset_path
+):
+    try:
+        cvc = get_component_version_content(
+            learning_package_identifier, component_identifier, version_num, asset_path
+        )
+    except ObjectDoesNotExist:
+        raise Http404("File not found")
+
+    if not cvc.learner_downloadable and not (
+        request.user and request.user.is_superuser
+    ):
+        raise PermissionDenied("This file is not publicly downloadable.")
+
+    # TODO: Etag support
+    return FileResponse(cvc.raw_content.file, filename=cvc.identifier)
diff --git a/openedx_learning/contrib/readme.rst b/openedx_learning/contrib/readme.rst
@@ -0,0 +1,9 @@
+Contrib Package
+===============
+
+The ``contrib`` package holds Django apps that *could* be implemented in separate repos, but are bundled here because it's more convenient to do so.
+
+Guidelines
+----------
+
+Nothing from ``lib`` or ``core`` should *ever* import from ``contrib``.
Original file line number	Diff line number	Diff line change
Expand Up		@@ -65,3 +65,5 @@ dev.db

		.vscode

		# Media files (for uploads)
		media/