Skip to content
This repository has been archived by the owner on Feb 22, 2023. It is now read-only.

Commit

Permalink
Merge pull request #132 from WordPress/es_models_audio
Browse files Browse the repository at this point in the history
  • Loading branch information
dhruvkb authored Aug 3, 2021
2 parents c28b9df + 9ef51ad commit 864ea38
Show file tree
Hide file tree
Showing 6 changed files with 756 additions and 428 deletions.
300 changes: 212 additions & 88 deletions ingestion_server/ingestion_server/elasticsearch_models.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from enum import Enum, auto
from elasticsearch_dsl import Integer, DocType, Field
from ingestion_server.categorize import get_categories

from elasticsearch_dsl import Integer, Document, Field

from ingestion_server.authority import get_authority_boost
from ingestion_server.categorize import get_categories

"""
Provides an ORM-like experience for accessing data in Elasticsearch.
Expand All @@ -27,14 +29,14 @@ def _verify_rank_feature(value, low, high):
return floor


class SyncableDocType(DocType):
class SyncableDocType(Document):
"""
Represents tables in the source-of-truth that will be replicated to
Elasticsearch.
"""
# Aggregations can't be performed on the _id meta-column, which necessitates
# copying it to this column in the doc. Aggregation is used to find the last
# document inserted into Elasticsearch
# Aggregations can't be performed on the _id meta-column, which
# necessitates copying it to this column in the doc. Aggregation is
# used to find the last document inserted into Elasticsearch
id = Integer()

@staticmethod
Expand All @@ -47,75 +49,83 @@ def database_row_to_elasticsearch_doc(row, schema):
:param schema: A map of each field name to its position in the row.
:return:
"""
raise NotImplemented(
raise NotImplementedError(
'Model is missing database -> Elasticsearch translation.'
)


class Image(SyncableDocType):
class Media(SyncableDocType):
"""
Represents an image in Elasticsearch. Note that actual mappings are defined
in `ingestion_server.es_mapping`.
Represents a media object in Elasticsearch. Note that actual mappings
are defined in `ingestion_server.es_mapping`.
"""
class AspectRatios(Enum):
TALL = auto()
WIDE = auto()
SQUARE = auto()

class ImageSizes(Enum):
class Index:
name = 'media'

@staticmethod
def database_row_to_elasticsearch_doc(row, schema):
"""
Maximum threshold for each image size band
Map each row in the downstream database to a Python dictionary that
represents a document in the ElasticSearch index.
:param row: the database row as a tuple obtained by the psycopg2 cursor
:param schema: the mapping of database column names to the tuple index
:return: a dictionary mapping the row tuple to an ES doc
"""
SMALL = 640 * 480
MEDIUM = 1600 * 900
LARGE = float("inf")

class Index:
name = 'image'
raise NotImplementedError(
'Missing database row -> Elasticsearch schema translation.'
)

@staticmethod
def database_row_to_elasticsearch_doc(row, schema):
provider = row[schema['provider']]
source = row[schema['source']]
extension = Image.get_extension(row[schema['url']])
height = row[schema['height']]
width = row[schema['width']]
def get_instance_attrs(row, schema):
"""
Map the common columns in the database row to a Python dictionary that
represents a part of the ES doc.
:param row: the database row as a tuple obtained by the psycopg2 cursor
:param schema: the mapping of database column names to the tuple index
:return: the ES sub-document holding the common cols of the row tuple
"""

meta = row[schema['meta_data']]

if 'standardized_popularity' in schema:
popularity = Image.get_popularity(
popularity = Media.get_popularity(
row[schema['standardized_popularity']]
)
else:
popularity = None
authority_boost = Image.get_authority_boost(meta, provider)
return Image(
_id=row[schema['id']],
id=row[schema['id']],
title=row[schema['title']],
identifier=row[schema['identifier']],
creator=row[schema['creator']],
creator_url=row[schema['creator_url']],
tags=Image.parse_detailed_tags(row[schema['tags']]),
created_on=row[schema['created_on']],
url=row[schema['url']],
thumbnail=row[schema['thumbnail']],
provider=provider,
source=row[schema['source']],
license=row[schema['license']].lower(),
license_version=row[schema['license_version']],
foreign_landing_url=row[schema['foreign_landing_url']],
description=Image.parse_description(meta),
extension=Image.get_extension(row[schema['url']]),
categories=get_categories(extension, source),
aspect_ratio=Image.get_aspect_ratio(height, width),
size=Image.get_size(height, width),
license_url=Image.get_license_url(meta),
mature=Image.get_maturity(meta, row[schema['mature']]),
standardized_popularity=popularity,
authority_boost=authority_boost,
max_boost=max(popularity or 1, authority_boost or 1),
min_boost=min(popularity or 1, authority_boost or 1)
)

return {
'_id': row[schema['id']],
'id': row[schema['id']],
'identifier': row[schema['identifier']],

'title': row[schema['title']],
'foreign_landing_url': row[schema['foreign_landing_url']],
'description': Media.parse_description(meta),

'creator': row[schema['creator']],
'creator_url': row[schema['creator_url']],

'url': row[schema['url']],
'extension': Media.get_extension(row[schema['url']]),

'license': row[schema['license']].lower(),
'license_version': row[schema['license_version']],
'license_url': Media.get_license_url(meta),

'provider': row[schema['provider']],
'source': row[schema['source']],

'created_on': row[schema['created_on']],
'tags': Media.parse_detailed_tags(row[schema['tags']]),
'mature': Media.get_maturity(meta, row[schema['mature']]),

'standardized_popularity': popularity,
}

@staticmethod
def parse_description(metadata_field):
Expand All @@ -132,33 +142,15 @@ def parse_description(metadata_field):

@staticmethod
def get_extension(url):
"""
Get the extension from the last segment of the URL separated by a dot.
"""
extension = url.split('.')[-1].lower()
if '/' in extension or extension is None:
return None
else:
return extension

@staticmethod
def get_aspect_ratio(height, width):
if height is None or width is None:
return None
elif height > width:
aspect_ratio = Image.AspectRatios.TALL.name
elif height < width:
aspect_ratio = Image.AspectRatios.WIDE.name
else:
aspect_ratio = Image.AspectRatios.SQUARE.name
return aspect_ratio.lower()

@staticmethod
def get_size(height, width):
if height is None or width is None:
return None
resolution = height * width
for size in Image.ImageSizes:
if resolution < size.value:
return size.name.lower()

@staticmethod
def get_license_url(meta_data):
"""
Expand Down Expand Up @@ -212,20 +204,152 @@ def get_popularity(raw):

@staticmethod
def parse_detailed_tags(json_tags):
if json_tags:
parsed_tags = []
for tag in json_tags:
if 'name' in tag:
parsed_tag = {'name': tag['name']}
if 'accuracy' in tag:
parsed_tag['accuracy'] = tag['accuracy']
parsed_tags.append(parsed_tag)
return parsed_tags
if not json_tags:
return None
parsed_tags = []
for tag in json_tags:
if 'name' in tag:
parsed_tag = {'name': tag['name']}
if 'accuracy' in tag:
parsed_tag['accuracy'] = tag['accuracy']
parsed_tags.append(parsed_tag)
return parsed_tags


class Image(Media):
    """
    Elasticsearch model for an image. The field mappings themselves are
    declared in `ingestion_server.es_mapping`; this class only turns database
    rows into documents.
    """

    class AspectRatios(Enum):
        """
        Orientation buckets reported in the `aspect_ratio` field. The same
        values are hardcoded in
        openverse-api/catalog/api/serializers/image_serializers.py.
        """
        TALL = auto()
        WIDE = auto()
        SQUARE = auto()

    class ImageSizes(Enum):
        """
        Exclusive upper bound on the resolution (height * width) of each size
        band. Members must stay in ascending order because `get_size` returns
        the first band whose bound exceeds the resolution. These bands are
        also hardcoded in
        openverse-api/catalog/api/serializers/image_serializers.py.
        """
        SMALL = 640 * 480
        MEDIUM = 1600 * 900
        LARGE = float("inf")

    class Index:
        name = 'image'

    @staticmethod
    def database_row_to_elasticsearch_doc(row, schema):
        """
        Build an `Image` document from a single database row.
        :param row: the database row, indexable by column position
        :param schema: a map of each column name to its position in the row
        :return: an `Image` holding the shared media attributes plus the
        image-specific ones (thumbnail, categories, aspect ratio, size and
        boosts)
        """
        # Image-specific values derived from the row.
        row_source = row[schema['source']]
        file_extension = Image.get_extension(row[schema['url']])
        row_categories = get_categories(file_extension, row_source)

        img_height = row[schema['height']]
        img_width = row[schema['width']]
        ratio_band = Image.get_aspect_ratio(img_height, img_width)
        size_band = Image.get_size(img_height, img_width)

        metadata = row[schema['meta_data']]
        row_provider = row[schema['provider']]
        boost = Image.get_authority_boost(metadata, row_provider)

        # Attributes common to every media type.
        shared = Image.get_instance_attrs(row, schema)
        pop = shared['standardized_popularity']

        # A missing (or falsy) boost component counts as 1.
        boost_pair = (pop or 1, boost or 1)

        image_only = {
            'thumbnail': row[schema['thumbnail']],
            'categories': row_categories,
            'aspect_ratio': ratio_band,
            'size': size_band,
            'authority_boost': boost,
            'max_boost': max(boost_pair),
            'min_boost': min(boost_pair),
        }
        return Image(**image_only, **shared)

    @staticmethod
    def get_aspect_ratio(height, width):
        """
        Classify the orientation of an image.
        :param height: the image height, or None if unknown
        :param width: the image width, or None if unknown
        :return: 'tall', 'wide' or 'square'; None if either dimension is
        missing
        """
        if height is None or width is None:
            return None
        if height > width:
            shape = Image.AspectRatios.TALL
        elif height < width:
            shape = Image.AspectRatios.WIDE
        else:
            shape = Image.AspectRatios.SQUARE
        return shape.name.lower()

    @staticmethod
    def get_size(height, width):
        """
        Classify an image into a size band by its resolution.
        :param height: the image height, or None if unknown
        :param width: the image width, or None if unknown
        :return: the lowercase name of the first band whose threshold is
        greater than height * width; None if either dimension is missing or
        no band matches
        """
        if height is None or width is None:
            return None
        area = height * width
        return next(
            (
                band.name.lower()
                for band in Image.ImageSizes
                if area < band.value
            ),
            None,
        )


class Audio(Media):
    """
    Elasticsearch model for an audio file. The field mappings themselves are
    declared in `ingestion_server.es_mapping`; this class only turns database
    rows into documents.
    """

    class Durations(Enum):
        """
        Exclusive upper bound of each audio duration band. Members must stay
        in ascending order because `get_duration` returns the first band whose
        bound exceeds the duration. These bands are also hardcoded in the
        `duration` field in
        openverse-api/catalog/api/serializers/audio_serializers.py.
        """
        # NOTE(review): the 1e3 factor suggests milliseconds -- confirm
        # against the unit of the stored duration.
        SHORT = 4 * 60 * 1e3  # under 4 minutes
        MEDIUM = 20 * 60 * 1e3  # 4 - 20 minutes
        LONG = float("inf")  # longer than 20 minutes

    class Index:
        name = 'audio'

    @staticmethod
    def database_row_to_elasticsearch_doc(row, schema):
        """
        Build an `Audio` document from a single database row.
        :param row: the database row, indexable by column position
        :param schema: a map of each column name to its position in the row
        :return: an `Audio` holding the shared media attributes plus the
        audio-specific ones (bit rate, sample rate, genres, category and
        boosts)
        """
        metadata = row[schema['meta_data']]
        row_provider = row[schema['provider']]
        boost = Audio.get_authority_boost(metadata, row_provider)

        # Attributes common to every media type.
        shared = Audio.get_instance_attrs(row, schema)
        pop = shared['standardized_popularity']

        # A missing (or falsy) boost component counts as 1.
        boost_pair = (pop or 1, boost or 1)

        audio_only = {
            'bit_rate': row[schema['bit_rate']],
            'sample_rate': row[schema['sample_rate']],
            'genres': row[schema['genres']],
            'category': row[schema['category']],
            'authority_boost': boost,
            'max_boost': max(boost_pair),
            'min_boost': min(boost_pair),
        }
        return Audio(**audio_only, **shared)

    @staticmethod
    def get_duration(duration):
        """
        Classify an audio file into a duration band.
        :param duration: the length of the audio; falsy values (None, 0) are
        treated as unknown
        :return: the lowercase name of the first band whose threshold is
        greater than the duration; None if the duration is unknown or no band
        matches
        """
        if not duration:
            return None
        return next(
            (
                band.name.lower()
                for band in Audio.Durations
                if duration < band.value
            ),
            None,
        )


# Table name -> Elasticsearch model
database_table_to_elasticsearch_model = {
'image': Image
'image': Image,
'audio': Audio,
}
Loading

0 comments on commit 864ea38

Please sign in to comment.