Skip to content
This repository has been archived by the owner on Feb 22, 2023. It is now read-only.

ElasticSearch models audio #132

Merged
merged 28 commits into from
Aug 3, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
2101fbe
Use the new class name `Document`
dhruvkb Jul 8, 2021
ed95a25
Add documentation linking document fields to Django serializers
dhruvkb Jul 8, 2021
b00d854
Clean up imports
dhruvkb Jul 8, 2021
8ca3267
Extract mapping of common fields to the parent class
dhruvkb Jul 8, 2021
501f75e
Create enum for duration
dhruvkb Jul 8, 2021
c9c17e4
Use parent class method for setting common fields
dhruvkb Jul 8, 2021
b0ec1a2
Merge branch 'main' of https://github.com/WordPress/openverse-api int…
dhruvkb Jul 8, 2021
0ff90f2
Fix code style violations
dhruvkb Jul 8, 2021
441441f
Update time duration ranges
dhruvkb Jul 8, 2021
a535715
Reorder fields to match order in `elasticsearch_models.py`
dhruvkb Jul 9, 2021
b037430
Merge branch 'main' of https://github.com/WordPress/openverse-api int…
dhruvkb Jul 12, 2021
c555622
Align models with their PostgreSQL schemas
dhruvkb Jul 13, 2021
b433256
Use image-like cleanup for audio
dhruvkb Jul 13, 2021
f97d0e7
Define mappings for audio
dhruvkb Jul 13, 2021
4ac4670
Create SQL queries for deleted and mature checks
dhruvkb Jul 13, 2021
31609ca
Use shared deleted/mature check SQL
dhruvkb Jul 13, 2021
0af9089
Fix dynamic SQL bugs (introduced in #117) using `psycopg2` APIs
dhruvkb Jul 13, 2021
03211a3
Merge branch 'main' into es_models_audio
dhruvkb Jul 14, 2021
776cb9a
Remove redundant audio cleanup code in light of #136
dhruvkb Jul 14, 2021
db4db93
Add documentation to ES models
dhruvkb Jul 14, 2021
e3a3e0d
Clean up imports
dhruvkb Jul 14, 2021
7bdadf8
Add documentation about the query
dhruvkb Jul 14, 2021
56d7f3a
Create separate constants for relative hostname and port
dhruvkb Jul 14, 2021
0bb35b2
Replace unsafe template string queries with safe dynamically generate…
dhruvkb Jul 14, 2021
1c3dc0c
Extract queries, refactor code and add stepwise documentation
dhruvkb Jul 14, 2021
952a078
Mention the names of the actual functions used
dhruvkb Jul 14, 2021
07fcddf
Fix code style violations
dhruvkb Jul 14, 2021
9ef51ad
Fix typo where Audio model uses `get_instance_attrs` of Image model
dhruvkb Jul 31, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
300 changes: 212 additions & 88 deletions ingestion_server/ingestion_server/elasticsearch_models.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from enum import Enum, auto
from elasticsearch_dsl import Integer, DocType, Field
from ingestion_server.categorize import get_categories

from elasticsearch_dsl import Integer, Document, Field

from ingestion_server.authority import get_authority_boost
from ingestion_server.categorize import get_categories

"""
Provides an ORM-like experience for accessing data in Elasticsearch.
Expand All @@ -27,14 +29,14 @@ def _verify_rank_feature(value, low, high):
return floor


class SyncableDocType(DocType):
class SyncableDocType(Document):
"""
Represents tables in the source-of-truth that will be replicated to
Elasticsearch.
"""
# Aggregations can't be performed on the _id meta-column, which necessitates
# copying it to this column in the doc. Aggregation is used to find the last
# document inserted into Elasticsearch
# Aggregations can't be performed on the _id meta-column, which
# necessitates copying it to this column in the doc. Aggregation is
# used to find the last document inserted into Elasticsearch
id = Integer()

@staticmethod
Expand All @@ -47,75 +49,83 @@ def database_row_to_elasticsearch_doc(row, schema):
:param schema: A map of each field name to its position in the row.
:return:
"""
raise NotImplemented(
raise NotImplementedError(
'Model is missing database -> Elasticsearch translation.'
)


class Image(SyncableDocType):
class Media(SyncableDocType):
"""
Represents an image in Elasticsearch. Note that actual mappings are defined
in `ingestion_server.es_mapping`.
Represents a media object in Elasticsearch. Note that actual mappings
are defined in `ingestion_server.es_mapping`.
"""
class AspectRatios(Enum):
TALL = auto()
WIDE = auto()
SQUARE = auto()

class ImageSizes(Enum):
class Index:
name = 'media'

@staticmethod
def database_row_to_elasticsearch_doc(row, schema):
"""
Maximum threshold for each image size band
Map each row in the downstream database to a Python dictionary that
represents a document in the ElasticSearch index.

:param row: the database row as a tuple obtained by the psycopg2 cursor
:param schema: the mapping of database column names to the tuple index
:return: a dictionary mapping the row tuple to an ES doc
"""
SMALL = 640 * 480
MEDIUM = 1600 * 900
LARGE = float("inf")

class Index:
name = 'image'
raise NotImplementedError(
'Missing database row -> Elasticsearch schema translation.'
)

@staticmethod
def database_row_to_elasticsearch_doc(row, schema):
provider = row[schema['provider']]
source = row[schema['source']]
extension = Image.get_extension(row[schema['url']])
height = row[schema['height']]
width = row[schema['width']]
def get_instance_attrs(row, schema):
    """
    Map the common columns in the database row to a Python dictionary that
    represents a part of the ES doc.

    :param row: the database row as a tuple obtained by the psycopg2 cursor
    :param schema: the mapping of database column names to the tuple index
    :return: the ES sub-document holding the common cols of the row tuple
    """

    meta = row[schema['meta_data']]
    url = row[schema['url']]

    # The popularity column is optional: it is only read when the schema
    # lists it, otherwise the field is indexed as ``None``.
    if 'standardized_popularity' in schema:
        popularity = Media.get_popularity(
            row[schema['standardized_popularity']]
        )
    else:
        popularity = None

    return {
        # ``_id`` is the meta-column; ``id`` is a plain copy of it.
        '_id': row[schema['id']],
        'id': row[schema['id']],
        'identifier': row[schema['identifier']],

        'title': row[schema['title']],
        'foreign_landing_url': row[schema['foreign_landing_url']],
        'description': Media.parse_description(meta),

        'creator': row[schema['creator']],
        'creator_url': row[schema['creator_url']],

        'url': url,
        'extension': Media.get_extension(url),

        'license': row[schema['license']].lower(),
        'license_version': row[schema['license_version']],
        'license_url': Media.get_license_url(meta),

        'provider': row[schema['provider']],
        'source': row[schema['source']],

        'created_on': row[schema['created_on']],
        'tags': Media.parse_detailed_tags(row[schema['tags']]),
        'mature': Media.get_maturity(meta, row[schema['mature']]),

        'standardized_popularity': popularity,
    }

@staticmethod
def parse_description(metadata_field):
Expand All @@ -132,33 +142,15 @@ def parse_description(metadata_field):

@staticmethod
def get_extension(url):
    """
    Get the extension from the last segment of the URL separated by a dot.

    :param url: the URL of the media file; may be ``None`` or empty
    :return: the lowercased text after the final dot, or ``None`` when the
        URL is missing, contains no dot at all, or the text after the final
        dot contains a ``/`` (the dot then belongs to the host or a
        directory name rather than to a file name)
    """
    if not url:
        return None
    # ``rpartition`` reports whether a dot was found at all, so a dotless
    # string is not mistaken for its own extension.
    _, dot, extension = url.rpartition('.')
    if not dot or '/' in extension:
        return None
    return extension.lower()

@staticmethod
def get_aspect_ratio(height, width):
if height is None or width is None:
return None
elif height > width:
aspect_ratio = Image.AspectRatios.TALL.name
elif height < width:
aspect_ratio = Image.AspectRatios.WIDE.name
else:
aspect_ratio = Image.AspectRatios.SQUARE.name
return aspect_ratio.lower()

@staticmethod
def get_size(height, width):
if height is None or width is None:
return None
resolution = height * width
for size in Image.ImageSizes:
if resolution < size.value:
return size.name.lower()

@staticmethod
def get_license_url(meta_data):
"""
Expand Down Expand Up @@ -212,20 +204,152 @@ def get_popularity(raw):

@staticmethod
def parse_detailed_tags(json_tags):
    """
    Reduce raw tag objects to their name and, when present, their accuracy.

    :param json_tags: an iterable of tag dictionaries, or a falsy value
    :return: a list of ``{'name': ...}`` dictionaries, each also carrying
        ``'accuracy'`` when the source tag has it; ``None`` when
        ``json_tags`` is empty or missing
    """
    if not json_tags:
        return None

    parsed_tags = []
    for tag in json_tags:
        # Entries that are not dictionaries, or that carry no name, are
        # skipped. The type check keeps a bare string from passing the
        # ``'name' in tag`` test as a substring match and then failing on
        # the key lookup.
        if not isinstance(tag, dict) or 'name' not in tag:
            continue
        parsed_tag = {'name': tag['name']}
        if 'accuracy' in tag:
            parsed_tag['accuracy'] = tag['accuracy']
        parsed_tags.append(parsed_tag)
    return parsed_tags


class Image(Media):
    """
    Elasticsearch document for a single image. The field mappings are not
    declared here; they are defined in `ingestion_server.es_mapping`.
    """

    class AspectRatios(Enum):
        """
        Orientation buckets for an image.

        The same values are also hardcoded in the `aspect_ratio` field in
        openverse-api/catalog/api/serializers/image_serializers.py.
        """
        TALL = auto()
        WIDE = auto()
        SQUARE = auto()

    class ImageSizes(Enum):
        """
        Upper bound (height * width) of each image size band. Members are
        listed smallest first because `get_size` returns the first band
        whose bound exceeds the resolution.

        These sizes are also hardcoded in
        openverse-api/catalog/api/serializers/image_serializers.py.
        NOTE(review): the previous wording named the `aspect_ratio` field
        here; presumably the size field was meant - confirm.
        """
        SMALL = 640 * 480
        MEDIUM = 1600 * 900
        LARGE = float("inf")

    class Index:
        name = 'image'

    @staticmethod
    def database_row_to_elasticsearch_doc(row, schema):
        """
        Build the `Image` document for one database row.

        :param row: the database row as a tuple obtained by the psycopg2 cursor
        :param schema: the mapping of database column names to the tuple index
        :return: an `Image` holding the common media attributes plus the
            image-specific fields and boost values
        """
        def column(name):
            # Look a column up by name through the schema's index mapping.
            return row[schema[name]]

        categories = get_categories(
            Image.get_extension(column('url')),
            column('source'),
        )
        height, width = column('height'), column('width')
        authority_boost = Image.get_authority_boost(
            column('meta_data'),
            column('provider'),
        )

        attrs = Image.get_instance_attrs(row, schema)
        # A missing (falsy) popularity or authority boost counts as 1 when
        # picking the larger and smaller of the two.
        boosts = (attrs['standardized_popularity'] or 1, authority_boost or 1)

        return Image(
            thumbnail=column('thumbnail'),

            categories=categories,
            aspect_ratio=Image.get_aspect_ratio(height, width),
            size=Image.get_size(height, width),

            authority_boost=authority_boost,
            max_boost=max(boosts),
            min_boost=min(boosts),
            **attrs,
        )

    @staticmethod
    def get_aspect_ratio(height, width):
        """
        Classify the image as tall, wide or square.

        :param height: the image height, or ``None``
        :param width: the image width, or ``None``
        :return: the lowercased `AspectRatios` member name, or ``None``
            when either dimension is ``None``
        """
        if height is None or width is None:
            return None
        if height > width:
            ratio = Image.AspectRatios.TALL
        elif height < width:
            ratio = Image.AspectRatios.WIDE
        else:
            ratio = Image.AspectRatios.SQUARE
        return ratio.name.lower()

    @staticmethod
    def get_size(height, width):
        """
        Find the size band that the image resolution falls into.

        :param height: the image height, or ``None``
        :param width: the image width, or ``None``
        :return: the lowercased name of the first `ImageSizes` member whose
            bound is greater than ``height * width``, or ``None`` when
            either dimension is ``None``
        """
        if height is None or width is None:
            return None
        resolution = height * width
        return next(
            (
                band.name.lower()
                for band in Image.ImageSizes
                if resolution < band.value
            ),
            None,
        )


class Audio(Media):
    """
    Elasticsearch document for a single audio file. The field mappings are
    not declared here; they are defined in `ingestion_server.es_mapping`.
    """

    class Durations(Enum):
        """
        Upper bound of each audio duration band, smallest first. The inline
        comments give the bands in minutes; the values are presumably
        milliseconds (minutes * 60 * 1e3) - confirm against the source
        column.

        These durations are also hardcoded in the `duration` field in
        openverse-api/catalog/api/serializers/audio_serializers.py.
        """
        SHORT = 4 * 60 * 1e3  # under 4 minutes
        MEDIUM = 20 * 60 * 1e3  # 4 - 20 minutes
        LONG = float("inf")  # longer than 20 minutes

    class Index:
        name = 'audio'

    @staticmethod
    def database_row_to_elasticsearch_doc(row, schema):
        """
        Build the `Audio` document for one database row.

        :param row: the database row as a tuple obtained by the psycopg2 cursor
        :param schema: the mapping of database column names to the tuple index
        :return: an `Audio` holding the common media attributes plus the
            audio-specific fields and boost values
        """
        # NOTE(review): `get_duration` is not called from this method as
        # shown here; confirm whether a duration band should be indexed.
        def column(name):
            # Look a column up by name through the schema's index mapping.
            return row[schema[name]]

        authority_boost = Audio.get_authority_boost(
            column('meta_data'),
            column('provider'),
        )

        attrs = Audio.get_instance_attrs(row, schema)
        # A missing (falsy) popularity or authority boost counts as 1 when
        # picking the larger and smaller of the two.
        boosts = (attrs['standardized_popularity'] or 1, authority_boost or 1)

        return Audio(
            bit_rate=column('bit_rate'),
            sample_rate=column('sample_rate'),
            genres=column('genres'),
            category=column('category'),

            authority_boost=authority_boost,
            max_boost=max(boosts),
            min_boost=min(boosts),
            **attrs,
        )

    @staticmethod
    def get_duration(duration):
        """
        Find the duration band that an audio length falls into.

        :param duration: the audio length, in the same unit as the
            `Durations` bounds; a falsy value (``None`` or 0) means unknown
        :return: the lowercased name of the first `Durations` member whose
            bound is greater than ``duration``, or ``None`` when
            ``duration`` is falsy
        """
        if not duration:
            return None
        return next(
            (
                band.name.lower()
                for band in Audio.Durations
                if duration < band.value
            ),
            None,
        )


# Table name -> Elasticsearch model. Each key is the name of a table in the
# source-of-truth database and each value is the document class whose
# `database_row_to_elasticsearch_doc` translates rows of that table.
database_table_to_elasticsearch_model = {
    'image': Image,
    'audio': Audio,
}
Loading