Skip to content

Commit

Permalink
Reduce ES shard count and simplify index properties (#3143)
Browse files Browse the repository at this point in the history
  • Loading branch information
dhruvkb authored Oct 19, 2023
1 parent 0bd4f63 commit be0256c
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 101 deletions.
63 changes: 28 additions & 35 deletions ingestion_server/ingestion_server/elasticsearch_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,26 +97,38 @@ def get_instance_attrs(row, schema):
# cleanup tests in CI: test/unit_tests/test_cleanup.py
category = row[schema["category"]] if "category" in schema else None

provider = row[schema["provider"]]
authority_boost = Media.get_authority_boost(meta, provider)

# This matches the order of fields defined in ``es_mapping.py``.
return {
"_id": row[schema["id"]],
"id": row[schema["id"]],
"created_on": row[schema["created_on"]],
"mature": Media.get_maturity(meta, row[schema["mature"]]),
# Keyword fields
"identifier": row[schema["identifier"]],
"license": row[schema["license"]].lower(),
"provider": provider,
"source": row[schema["source"]],
"category": category,
# Text-based fields
"title": row[schema["title"]],
"foreign_landing_url": row[schema["foreign_landing_url"]],
"description": Media.parse_description(meta),
"creator": row[schema["creator"]],
"creator_url": row[schema["creator_url"]],
# Rank feature fields
"standardized_popularity": popularity,
"authority_boost": authority_boost,
"max_boost": max(popularity or 1, authority_boost or 1),
"min_boost": min(popularity or 1, authority_boost or 1),
# Nested fields
"tags": Media.parse_detailed_tags(row[schema["tags"]]),
# Extra fields, not indexed
"url": row[schema["url"]],
"license": row[schema["license"]].lower(),
"foreign_landing_url": row[schema["foreign_landing_url"]],
"creator_url": row[schema["creator_url"]],
"license_version": row[schema["license_version"]],
"license_url": Media.get_license_url(meta),
"provider": row[schema["provider"]],
"source": row[schema["source"]],
"category": category,
"created_on": row[schema["created_on"]],
"tags": Media.parse_detailed_tags(row[schema["tags"]]),
"mature": Media.get_maturity(meta, row[schema["mature"]]),
"standardized_popularity": popularity,
}

@staticmethod
Expand Down Expand Up @@ -230,28 +242,18 @@ class Index:
@staticmethod
def database_row_to_elasticsearch_doc(row, schema):
extension = Image.get_extension(row[schema["url"]])

height = row[schema["height"]]
width = row[schema["width"]]
aspect_ratio = Image.get_aspect_ratio(height, width)
size = Image.get_size(height, width)

meta = row[schema["meta_data"]]
provider = row[schema["provider"]]
authority_boost = Image.get_authority_boost(meta, provider)

attrs = Image.get_instance_attrs(row, schema)
attrs["category"] = attrs["category"]
popularity = attrs["standardized_popularity"]

return Image(
thumbnail=row[schema["thumbnail"]],
aspect_ratio=aspect_ratio,
extension=extension,
size=size,
authority_boost=authority_boost,
max_boost=max(popularity or 1, authority_boost or 1),
min_boost=min(popularity or 1, authority_boost or 1),
# Extra fields, not indexed
thumbnail=row[schema["thumbnail"]],
**attrs,
)

Expand Down Expand Up @@ -319,27 +321,18 @@ def database_row_to_elasticsearch_doc(row, schema):
alt_files = row[schema["alt_files"]]
filetype = row[schema["filetype"]]
extension = Audio.get_extensions(filetype, alt_files)

meta = row[schema["meta_data"]]
provider = row[schema["provider"]]
authority_boost = Audio.get_authority_boost(meta, provider)

attrs = Audio.get_instance_attrs(row, schema)
popularity = attrs["standardized_popularity"]

length = Audio.get_length(row[schema["duration"]])

return Audio(
length=length,
filetype=filetype,
extension=extension,
# Extra fields, not indexed
bit_rate=row[schema["bit_rate"]],
sample_rate=row[schema["sample_rate"]],
genres=row[schema["genres"]],
duration=row[schema["duration"]],
length=length,
filetype=filetype,
extension=extension,
authority_boost=authority_boost,
max_boost=max(popularity or 1, authority_boost or 1),
min_boost=min(popularity or 1, authority_boost or 1),
**attrs,
)

Expand Down
121 changes: 55 additions & 66 deletions ingestion_server/ingestion_server/es_mapping.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,22 @@
def index_settings(table_name):
from ingestion_server.constants.media_types import AUDIO_TYPE, IMAGE_TYPE, MediaType


def index_settings(media_type: MediaType):
"""
Return the Elasticsearch mapping for a given table in the database.
:param table_name: The name of the table in the upstream database.
:return:
:param media_type: The media type (e.g. image or audio) whose index is being configured.
:return: a dict containing the index ``settings`` and ``mappings`` for that media type
"""

number_of_shards: dict[MediaType, int] = {
IMAGE_TYPE: 18,
AUDIO_TYPE: 1,
}

settings = {
"index": {
"number_of_shards": 18,
"number_of_shards": number_of_shards[media_type],
"number_of_replicas": 0,
"refresh_interval": "-1",
},
Expand Down Expand Up @@ -51,109 +60,89 @@ def index_settings(table_name):
},
}
common_mappings = {
"dynamic": False, # extra fields are stored in ``_source`` but not indexed
"properties": {
"id": {"type": "long"},
"created_on": {"type": "date"},
"mature": {"type": "boolean"},
# Keyword fields
"identifier": {
"fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
"type": "text",
# TODO: Remove subfield when API is updated
"fields": {"keyword": {"type": "keyword"}},
"type": "keyword",
},
"extension": {"type": "keyword"},
"license": {
# TODO: Remove subfield when API is updated
"fields": {"keyword": {"type": "keyword"}},
"type": "keyword",
},
"provider": {"type": "keyword"},
"source": {
# TODO: Remove subfield when API is updated
"fields": {"keyword": {"type": "keyword"}},
"type": "keyword",
},
"filetype": {"type": "keyword"},
"category": {"type": "keyword"},
# Text-based fields
"title": {
"type": "text",
"analyzer": "custom_english",
"similarity": "boolean",
"fields": {
"keyword": {"type": "keyword", "ignore_above": 256},
"raw": {"type": "text", "index": True},
},
"analyzer": "custom_english",
},
"foreign_landing_url": {
"fields": {"keyword": {"ignore_above": 256, "type": "keyword"}},
"type": "text",
},
"description": {
"fields": {
"keyword": {"type": "keyword", "similarity": "boolean"},
"raw": {"type": "text", "index": True},
},
"type": "text",
"analyzer": "custom_english",
"similarity": "boolean",
"fields": {"raw": {"type": "text", "index": True}},
},
"creator": {
"type": "text",
"fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
},
"url": {
"fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
"type": "text",
},
"extension": {
"fields": {"keyword": {"ignore_above": 8, "type": "keyword"}},
"type": "text",
},
"license": {
"fields": {"keyword": {"ignore_above": 256, "type": "keyword"}},
"type": "text",
},
"license_version": {
"type": "text",
"fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
},
"license_url": {
"fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
"type": "text",
},
"provider": {
"type": "text",
"fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
},
"source": {
"fields": {"keyword": {"ignore_above": 256, "type": "keyword"}},
"type": "text",
# Rank feature fields
"standardized_popularity": {"type": "rank_feature"},
"authority_boost": {"type": "rank_feature"},
"authority_penalty": {
"type": "rank_feature",
"positive_score_impact": False,
},
"filetype": {"type": "keyword"},
"created_on": {"type": "date"},
"max_boost": {"type": "rank_feature"},
"min_boost": {"type": "rank_feature"},
# Nested fields
"tags": {
"properties": {
"accuracy": {"type": "float"},
# Text-based fields
"name": {
"type": "text",
"analyzer": "custom_english",
"fields": {
"keyword": {"type": "keyword", "ignore_above": 256},
"raw": {"type": "text", "index": True},
},
"analyzer": "custom_english",
},
}
},
"mature": {"type": "boolean"},
"standardized_popularity": {"type": "rank_feature"},
"authority_boost": {"type": "rank_feature"},
"authority_penalty": {
"type": "rank_feature",
"positive_score_impact": False,
},
"max_boost": {"type": "rank_feature"},
"min_boost": {"type": "rank_feature"},
"category": {"type": "keyword"},
}
},
}
media_properties = {
"image": {
"aspect_ratio": {
"fields": {"keyword": {"type": "keyword"}},
"type": "text",
},
"size": {"fields": {"keyword": {"type": "keyword"}}, "type": "text"},
# Keyword fields
"aspect_ratio": {"type": "keyword"},
"size": {"type": "keyword"},
},
"audio": {
"bit_rate": {"type": "integer"},
"sample_rate": {"type": "integer"},
"genres": {"fields": {"keyword": {"type": "keyword"}}, "type": "text"},
"duration": {"type": "integer"},
# Keyword fields
"length": {"type": "keyword"},
},
}
media_mappings = common_mappings.copy()
media_mappings["properties"].update(media_properties[table_name])
media_mappings["properties"].update(media_properties[media_type])
result = {"settings": settings.copy(), "mappings": media_mappings}
return result

0 comments on commit be0256c

Please sign in to comment.