WordPress · dhruvkb · Aug 3, 2021 · Jul 8, 2021 · Jul 8, 2021 · Jul 8, 2021
@@ -1,7 +1,9 @@
 from enum import Enum, auto
-from elasticsearch_dsl import Integer, DocType, Field
-from ingestion_server.categorize import get_categories
+
+from elasticsearch_dsl import Integer, Document, Field
+
 from ingestion_server.authority import get_authority_boost
+from ingestion_server.categorize import get_categories
 
 """
 Provides an ORM-like experience for accessing data in Elasticsearch.
@@ -27,14 +29,14 @@ def _verify_rank_feature(value, low, high):
     return floor
 
 
-class SyncableDocType(DocType):
+class SyncableDocType(Document):
     """
     Represents tables in the source-of-truth that will be replicated to
     Elasticsearch.
     """
-    # Aggregations can't be performed on the _id meta-column, which necessitates
-    # copying it to this column in the doc. Aggregation is used to find the last
-    # document inserted into Elasticsearch
+    # Aggregations can't be performed on the _id meta-column, which
+    # necessitates copying it to this column in the doc. Aggregation is
+    # used to find the last document inserted into Elasticsearch
     id = Integer()
 
     @staticmethod
@@ -47,75 +49,83 @@ def database_row_to_elasticsearch_doc(row, schema):
         :param schema: A map of each field name to its position in the row.
         :return:
         """
-        raise NotImplemented(
+        raise NotImplementedError(
             'Model is missing database -> Elasticsearch translation.'
         )
 
 
-class Image(SyncableDocType):
+class Media(SyncableDocType):
     """
-    Represents an image in Elasticsearch. Note that actual mappings are defined
-    in `ingestion_server.es_mapping`.
+    Represents a media object in Elasticsearch. Note that actual mappings
+    are defined in `ingestion_server.es_mapping`.
     """
-    class AspectRatios(Enum):
-        TALL = auto()
-        WIDE = auto()
-        SQUARE = auto()
 
-    class ImageSizes(Enum):
+    class Index:
+        name = 'media'
+
+    @staticmethod
+    def database_row_to_elasticsearch_doc(row, schema):
         """
-        Maximum threshold for each image size band
+        Map each row in the downstream database to an instance of this model
+        that represents a document in the Elasticsearch index.
+
+        :param row: the database row as a tuple obtained by the psycopg2 cursor
+        :param schema: the mapping of database column names to the tuple index
+        :return: the model instance (ES doc) built from the row tuple
         """
-        SMALL = 640 * 480
-        MEDIUM = 1600 * 900
-        LARGE = float("inf")
 
-    class Index:
-        name = 'image'
+        raise NotImplementedError(
+            'Missing database row -> Elasticsearch schema translation.'
+        )
 
     @staticmethod
-    def database_row_to_elasticsearch_doc(row, schema):
-        provider = row[schema['provider']]
-        source = row[schema['source']]
-        extension = Image.get_extension(row[schema['url']])
-        height = row[schema['height']]
-        width = row[schema['width']]
+    def get_instance_attrs(row, schema):
+        """
+        Map the common columns in the database row to a Python dictionary that
+        represents a part of the ES doc.
+
+        :param row: the database row as a tuple obtained by the psycopg2 cursor
+        :param schema: the mapping of database column names to the tuple index
+        :return: the ES sub-document holding the common cols of the row tuple
+        """
+
         meta = row[schema['meta_data']]
+
         if 'standardized_popularity' in schema:
-            popularity = Image.get_popularity(
+            popularity = Media.get_popularity(
                 row[schema['standardized_popularity']]
             )
         else:
             popularity = None
-        authority_boost = Image.get_authority_boost(meta, provider)
-        return Image(
-            _id=row[schema['id']],
-            id=row[schema['id']],
-            title=row[schema['title']],
-            identifier=row[schema['identifier']],
-            creator=row[schema['creator']],
-            creator_url=row[schema['creator_url']],
-            tags=Image.parse_detailed_tags(row[schema['tags']]),
-            created_on=row[schema['created_on']],
-            url=row[schema['url']],
-            thumbnail=row[schema['thumbnail']],
-            provider=provider,
-            source=row[schema['source']],
-            license=row[schema['license']].lower(),
-            license_version=row[schema['license_version']],
-            foreign_landing_url=row[schema['foreign_landing_url']],
-            description=Image.parse_description(meta),
-            extension=Image.get_extension(row[schema['url']]),
-            categories=get_categories(extension, source),
-            aspect_ratio=Image.get_aspect_ratio(height, width),
-            size=Image.get_size(height, width),
-            license_url=Image.get_license_url(meta),
-            mature=Image.get_maturity(meta, row[schema['mature']]),
-            standardized_popularity=popularity,
-            authority_boost=authority_boost,
-            max_boost=max(popularity or 1, authority_boost or 1),
-            min_boost=min(popularity or 1, authority_boost or 1)
-        )
+
+        return {
+            '_id': row[schema['id']],
+            'id': row[schema['id']],
+            'identifier': row[schema['identifier']],
+
+            'title': row[schema['title']],
+            'foreign_landing_url': row[schema['foreign_landing_url']],
+            'description': Media.parse_description(meta),
+
+            'creator': row[schema['creator']],
+            'creator_url': row[schema['creator_url']],
+
+            'url': row[schema['url']],
+            'extension': Media.get_extension(row[schema['url']]),
+
+            'license': row[schema['license']].lower(),
+            'license_version': row[schema['license_version']],
+            'license_url': Media.get_license_url(meta),
+
+            'provider': row[schema['provider']],
+            'source': row[schema['source']],
+
+            'created_on': row[schema['created_on']],
+            'tags': Media.parse_detailed_tags(row[schema['tags']]),
+            'mature': Media.get_maturity(meta, row[schema['mature']]),
+
+            'standardized_popularity': popularity,
+        }
 
     @staticmethod
     def parse_description(metadata_field):
@@ -132,33 +142,15 @@ def parse_description(metadata_field):
 
     @staticmethod
     def get_extension(url):
+        """
+        Get the extension from the last segment of the URL separated by a dot.
+        """
         extension = url.split('.')[-1].lower()
         if '/' in extension or extension is None:
             return None
         else:
             return extension
 
-    @staticmethod
-    def get_aspect_ratio(height, width):
-        if height is None or width is None:
-            return None
-        elif height > width:
-            aspect_ratio = Image.AspectRatios.TALL.name
-        elif height < width:
-            aspect_ratio = Image.AspectRatios.WIDE.name
-        else:
-            aspect_ratio = Image.AspectRatios.SQUARE.name
-        return aspect_ratio.lower()
-
-    @staticmethod
-    def get_size(height, width):
-        if height is None or width is None:
-            return None
-        resolution = height * width
-        for size in Image.ImageSizes:
-            if resolution < size.value:
-                return size.name.lower()
-
     @staticmethod
     def get_license_url(meta_data):
         """
@@ -212,20 +204,152 @@ def get_popularity(raw):
 
     @staticmethod
     def parse_detailed_tags(json_tags):
-        if json_tags:
-            parsed_tags = []
-            for tag in json_tags:
-                if 'name' in tag:
-                    parsed_tag = {'name': tag['name']}
-                    if 'accuracy' in tag:
-                        parsed_tag['accuracy'] = tag['accuracy']
-                    parsed_tags.append(parsed_tag)
-            return parsed_tags
+        if not json_tags:
+            return None
+        parsed_tags = []
+        for tag in json_tags:
+            if 'name' in tag:
+                parsed_tag = {'name': tag['name']}
+                if 'accuracy' in tag:
+                    parsed_tag['accuracy'] = tag['accuracy']
+                parsed_tags.append(parsed_tag)
+        return parsed_tags
+
+
+class Image(Media):
+    """
+    Represents an image in Elasticsearch. Note that actual mappings are defined
+    in `ingestion_server.es_mapping`.
+    """
+
+    class AspectRatios(Enum):
+        """
+        These aspect ratios are also hardcoded in the `aspect_ratio` field in
+        openverse-api/catalog/api/serializers/image_serializers.py.
+        """
+        TALL = auto()
+        WIDE = auto()
+        SQUARE = auto()
+
+    class ImageSizes(Enum):
+        """
+        Maximum threshold for each image size band
+
+        These sizes are also hardcoded in the `size` field in
+        openverse-api/catalog/api/serializers/image_serializers.py.
+        """
+        SMALL = 640 * 480
+        MEDIUM = 1600 * 900
+        LARGE = float("inf")
+
+    class Index:
+        name = 'image'
+
+    @staticmethod
+    def database_row_to_elasticsearch_doc(row, schema):
+        source = row[schema['source']]
+        extension = Image.get_extension(row[schema['url']])
+        categories = get_categories(extension, source)
+
+        height = row[schema['height']]
+        width = row[schema['width']]
+        aspect_ratio = Image.get_aspect_ratio(height, width)
+        size = Image.get_size(height, width)
+
+        meta = row[schema['meta_data']]
+        provider = row[schema['provider']]
+        authority_boost = Image.get_authority_boost(meta, provider)
+
+        attrs = Image.get_instance_attrs(row, schema)
+        popularity = attrs['standardized_popularity']
+
+        return Image(
+            thumbnail=row[schema['thumbnail']],
+
+            categories=categories,
+            aspect_ratio=aspect_ratio,
+            size=size,
+
+            authority_boost=authority_boost,
+            max_boost=max(popularity or 1, authority_boost or 1),
+            min_boost=min(popularity or 1, authority_boost or 1),
+            **attrs,
+        )
+
+    @staticmethod
+    def get_aspect_ratio(height, width):
+        if height is None or width is None:
+            return None
+        elif height > width:
+            aspect_ratio = Image.AspectRatios.TALL.name
+        elif height < width:
+            aspect_ratio = Image.AspectRatios.WIDE.name
         else:
+            aspect_ratio = Image.AspectRatios.SQUARE.name
+        return aspect_ratio.lower()
+
+    @staticmethod
+    def get_size(height, width):
+        if height is None or width is None:
+            return None
+        resolution = height * width
+        for size in Image.ImageSizes:
+            if resolution < size.value:
+                return size.name.lower()
+
+
+class Audio(Media):
+    """
+    Represents an audio object in Elasticsearch. Note that actual mappings
+    are defined in `ingestion_server.es_mapping`.
+    """
+
+    class Durations(Enum):
+        """
+        Maximum threshold for each audio duration band
+
+        These durations are also hardcoded in the `duration` field in
+        openverse-api/catalog/api/serializers/audio_serializers.py.
+        """
+        SHORT = 4 * 60 * 1e3  # under 4 minutes
+        MEDIUM = 20 * 60 * 1e3  # 4 - 20 minutes
+        LONG = float("inf")  # longer than 20 minutes
+
+    class Index:
+        name = 'audio'
+
+    @staticmethod
+    def database_row_to_elasticsearch_doc(row, schema):
+        meta = row[schema['meta_data']]
+        provider = row[schema['provider']]
+        authority_boost = Audio.get_authority_boost(meta, provider)
+
+        attrs = Audio.get_instance_attrs(row, schema)
+        popularity = attrs['standardized_popularity']
+
+        return Audio(
+            bit_rate=row[schema['bit_rate']],
+            sample_rate=row[schema['sample_rate']],
+            genres=row[schema['genres']],
+            category=row[schema['category']],
+
+            authority_boost=authority_boost,
+            max_boost=max(popularity or 1, authority_boost or 1),
+            min_boost=min(popularity or 1, authority_boost or 1),
+            **attrs,
+        )
+
+    @staticmethod
+    def get_duration(duration):
+        if not duration:
             return None
+        for length in Audio.Durations:
+            if duration < length.value:
+                return length.name.lower()
 
 
 # Table name -> Elasticsearch model
 database_table_to_elasticsearch_model = {
-    'image': Image
+    'image': Image,
+    'audio': Audio,
 }