Skip to content

Commit

Permalink
feat: add support for static assets
Browse files Browse the repository at this point in the history
This alters the data model in the components app in a number of ways in
order to support static assets, along with some refactoring:

* Content has been renamed to RawContent to make it more clear when we
  are talking about "content" as a general concept and the actual data
  model that holds the raw bytes.
* RawContent now uses FileField instead of BinaryField, giving us more
  cheaply scalable storage in exchange for higher latency. This is
  offset by the new TextContent model that will be used to store the
  text versions of RawContent that needs low-latency access (like XBlock
  OLX).
* A primitive media_server app now exists to view static assets during
  development. It is NOT safe to use on a running site yet.
  • Loading branch information
David Ormsbee committed Apr 14, 2023
1 parent 867ea8d commit d18dc2d
Show file tree
Hide file tree
Showing 30 changed files with 1,016 additions and 558 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -65,3 +65,5 @@ dev.db

.vscode

# Media files (for uploads)
media/
1 change: 1 addition & 0 deletions olx_importer/apps.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,6 @@ class OLXImporterConfig(AppConfig):
"""
Configuration for the OLX Importer Django application.
"""

name = "olx_importer"
verbose_name = "OLX Importer"
106 changes: 67 additions & 39 deletions olx_importer/management/commands/load_components.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@
Open Question: If the data model is extensible, how do we know whether a change
has really happened between what's currently stored/published for a particular
item and the new value we want to set? For Content that's easy, because we have
actual hashes of the data. But it's not clear how that would work for something
like an ComponentVersion. We'd have to have some kind of mechanism where every
item and the new value we want to set? For RawContent that's easy, because we
have actual hashes of the data. But it's not clear how that would work for
something like an ComponentVersion. We'd have to have some kind of mechanism where every
pp that wants to attach data gets to answer the question of "has anything
changed?" in order to decide if we really make a new ComponentVersion or not.
"""
Expand All @@ -25,22 +25,28 @@
import re
import xml.etree.ElementTree as ET

from django.core.files.base import ContentFile
from django.core.management.base import BaseCommand
from django.db import transaction

from openedx_learning.core.publishing.models import LearningPackage, PublishLogEntry
from openedx_learning.core.components.models import (
Content, Component, ComponentVersion, ComponentVersionContent,
ComponentPublishLogEntry, PublishedComponent,
Component,
ComponentVersion,
ComponentVersionRawContent,
ComponentPublishLogEntry,
PublishedComponent,
RawContent,
TextContent,
)
from openedx_learning.lib.fields import create_hash_digest

SUPPORTED_TYPES = ['problem', 'video', 'html']
SUPPORTED_TYPES = ["problem", "video", "html"]
logger = logging.getLogger(__name__)


class Command(BaseCommand):
help = 'Load sample Component data from course export'
help = "Load sample Component data from course export"

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
Expand All @@ -61,20 +67,19 @@ def init_known_types(self):
# officially "text/javascript"
mimetypes.add_type("text/javascript", ".js")
mimetypes.add_type("text/javascript", ".mjs")


def add_arguments(self, parser):
parser.add_argument('course_data_path', type=pathlib.Path)
parser.add_argument('learning_package_identifier', type=str)
parser.add_argument("course_data_path", type=pathlib.Path)
parser.add_argument("learning_package_identifier", type=str)

def handle(self, course_data_path, learning_package_identifier, **options):
self.course_data_path = course_data_path
self.learning_package_identifier = learning_package_identifier
self.load_course_data(learning_package_identifier)

def get_course_title(self):
course_type_dir = self.course_data_path / 'course'
course_xml_file = next(course_type_dir.glob('*.xml'))
course_type_dir = self.course_data_path / "course"
course_xml_file = next(course_type_dir.glob("*.xml"))
course_root = ET.parse(course_xml_file).getroot()
return course_root.attrib.get("display_name", "Unknown Course")

Expand All @@ -87,9 +92,9 @@ def load_course_data(self, learning_package_identifier):
learning_package, _created = LearningPackage.objects.get_or_create(
identifier=learning_package_identifier,
defaults={
'title': title,
'created': now,
'updated': now,
"title": title,
"created": now,
"updated": now,
},
)
self.learning_package = learning_package
Expand All @@ -105,11 +110,13 @@ def load_course_data(self, learning_package_identifier):
self.import_block_type(block_type, now, publish_log_entry)

def create_content(self, static_local_path, now, component_version):
identifier = pathlib.Path('static') / static_local_path
identifier = pathlib.Path("static") / static_local_path
real_path = self.course_data_path / identifier
mime_type, _encoding = mimetypes.guess_type(identifier)
if mime_type is None:
logger.error(f" no mimetype found for {real_path}, defaulting to application/binary")
logger.error(
f" no mimetype found for {real_path}, defaulting to application/binary"
)
mime_type = "application/binary"

try:
Expand All @@ -120,20 +127,26 @@ def create_content(self, static_local_path, now, component_version):

hash_digest = create_hash_digest(data_bytes)

content, _created = Content.objects.get_or_create(
raw_content, created = RawContent.objects.get_or_create(
learning_package=self.learning_package,
mime_type=mime_type,
hash_digest=hash_digest,
defaults = dict(
data=data_bytes,
defaults=dict(
size=len(data_bytes),
created=now,
)
),
)
ComponentVersionContent.objects.get_or_create(
if created:
raw_content.file.save(
f"{raw_content.learning_package.uuid}/{hash_digest}",
ContentFile(data_bytes),
)

ComponentVersionRawContent.objects.get_or_create(
component_version=component_version,
content=content,
raw_content=raw_content,
identifier=identifier,
learner_downloadable=True,
)

def import_block_type(self, block_type, now, publish_log_entry):
Expand All @@ -146,35 +159,49 @@ def import_block_type(self, block_type, now, publish_log_entry):
static_files_regex = re.compile(r"""['"]\/static\/(.+?)["'\?]""")
block_data_path = self.course_data_path / block_type

for xml_file_path in block_data_path.glob('*.xml'):
for xml_file_path in block_data_path.glob("*.xml"):
components_found += 1
identifier = xml_file_path.stem

# Find or create the Component itself
component, _created = Component.objects.get_or_create(
learning_package=self.learning_package,
namespace='xblock.v1',
namespace="xblock.v1",
type=block_type,
identifier=identifier,
defaults = {
'created': now,
}
defaults={
"created": now,
},
)

# Create the Content entry for the raw data...
# Create the RawContent entry for the raw data...
data_bytes = xml_file_path.read_bytes()
hash_digest = create_hash_digest(data_bytes)
data_str = codecs.decode(data_bytes, 'utf-8')
content, _created = Content.objects.get_or_create(

raw_content, created = RawContent.objects.get_or_create(
learning_package=self.learning_package,
mime_type=f'application/vnd.openedx.xblock.v1.{block_type}+xml',
mime_type=f"application/vnd.openedx.xblock.v1.{block_type}+xml",
hash_digest=hash_digest,
defaults = dict(
data=data_bytes,
defaults=dict(
# text=data_str,
size=len(data_bytes),
created=now,
)
),
)
if created:
raw_content.file.save(
f"{raw_content.learning_package.uuid}/{hash_digest}",
ContentFile(data_bytes),
)

# Decode as utf-8-sig in order to strip any BOM from the data.
data_str = codecs.decode(data_bytes, "utf-8-sig")
TextContent.objects.create(
raw_content=raw_content,
text=data_str,
length=len(data_str),
)

# TODO: Get associated file contents, both with the static regex, as
# well as with XBlock-specific code that examines attributes in
# video and HTML tag definitions.
Expand All @@ -185,7 +212,7 @@ def import_block_type(self, block_type, now, publish_log_entry):
logger.error(f"Parse error for {xml_file_path}: {err}")
continue

display_name = block_root.attrib.get('display_name', "")
display_name = block_root.attrib.get("display_name", "")

# Create the ComponentVersion
component_version = ComponentVersion.objects.create(
Expand All @@ -195,10 +222,11 @@ def import_block_type(self, block_type, now, publish_log_entry):
created=now,
created_by=None,
)
ComponentVersionContent.objects.create(
ComponentVersionRawContent.objects.create(
component_version=component_version,
content=content,
identifier='source.xml',
raw_content=raw_content,
identifier="source.xml",
learner_downloadable=False,
)
static_files_found = static_files_regex.findall(data_str)
for static_local_path in static_files_found:
Expand Down
Empty file.
22 changes: 22 additions & 0 deletions openedx_learning/contrib/media_server/apps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from django.apps import AppConfig
from django.core.exceptions import ImproperlyConfigured
from django.conf import settings


class MediaServerConfig(AppConfig):
"""
Configuration for the Media Server application.
"""

name = "openedx_learning.contrib.media_server"
verbose_name = "Learning Core: Media Server"
default_auto_field = "django.db.models.BigAutoField"

def ready(self):
if not settings.DEBUG:
# Until we get proper security and support for running this app
# under a separate domain, just don't allow it to be run in
# production environments.
raise ImproperlyConfigured(
"The media_server app should only be run in DEBUG mode!"
)
16 changes: 16 additions & 0 deletions openedx_learning/contrib/media_server/readme.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
Media Server App
================

The ``media_server`` app exists to serve media files that are ultimately backed by the RawContent model, *for development purposes and for sites with light-to-moderate traffic*. It also provides an API that can be used by CDNs for high traffic sites.

Motivation
----------

The ``components`` app stores large binary file data by calculating the hash and creating a django-storages backed file named after that hash. This is efficient from a storage point of view, because we don't store redundant copies for every version of a Component. There are at least two drawbacks:

* We have unintelligibly named files that are confusing for clients.
* Intra-file links between media files break. For instance, if we have a piece of HTML that makes a reference to a VTT file, that filename will have changed.

This app tries to bridge that gap by serving URLs that preserve the original file names and give the illusion that there is a seprate set of media files for every version of a Component, but does a lookup behind the scenes to serve the correct hash-based-file.

The big caveat on this is that Django is not really optimized to do this sort of asset serving. The most scalable approach is to have a CDN-backed solution where ``media_server`` serves the locations of files that are converted by worker code to serving the actual assets. (More details to follow when that part gets built out.)
16 changes: 16 additions & 0 deletions openedx_learning/contrib/media_server/urls.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from django.urls import path

from .views import component_asset

urlpatterns = [
path(
(
"component_asset/"
"<str:learning_package_identifier>/"
"<str:component_identifier>/"
"<int:version_num>/"
"<path:asset_path>"
),
component_asset,
)
]
27 changes: 27 additions & 0 deletions openedx_learning/contrib/media_server/views.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
"""
Early views
"""
from django.http import FileResponse
from django.http import Http404
from django.core.exceptions import ObjectDoesNotExist, PermissionDenied

from openedx_learning.core.components.api import get_component_version_content


def component_asset(
request, learning_package_identifier, component_identifier, version_num, asset_path
):
try:
cvc = get_component_version_content(
learning_package_identifier, component_identifier, version_num, asset_path
)
except ObjectDoesNotExist:
raise Http404("File not found")

if not cvc.learner_downloadable and not (
request.user and request.user.is_superuser
):
raise PermissionDenied("This file is not publicly downloadable.")

# TODO: Etag support
return FileResponse(cvc.raw_content.file, filename=cvc.identifier)
9 changes: 9 additions & 0 deletions openedx_learning/contrib/readme.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
Contrib Package
===============

The ``contrib`` package holds Django apps that *could* be implemented in separate repos, but are bundled here because it's more convenient to do so.

Guidelines
----------

Nothing from ``lib`` or ``core`` should *ever* import from ``contrib``.
Loading

0 comments on commit d18dc2d

Please sign in to comment.