From be0256c7671e52a57fcbba060d68b3eda874d988 Mon Sep 17 00:00:00 2001 From: Dhruv Bhanushali Date: Thu, 19 Oct 2023 09:50:15 +0400 Subject: [PATCH] Reduce ES shard count and simplify index properties (#3143) --- .../ingestion_server/elasticsearch_models.py | 63 ++++----- .../ingestion_server/es_mapping.py | 121 ++++++++---------- 2 files changed, 83 insertions(+), 101 deletions(-) diff --git a/ingestion_server/ingestion_server/elasticsearch_models.py b/ingestion_server/ingestion_server/elasticsearch_models.py index 0397830f639..32fc8491cd6 100644 --- a/ingestion_server/ingestion_server/elasticsearch_models.py +++ b/ingestion_server/ingestion_server/elasticsearch_models.py @@ -97,26 +97,38 @@ def get_instance_attrs(row, schema): # cleanup tests in CI: test/unit_tests/test_cleanup.py category = row[schema["category"]] if "category" in schema else None + provider = row[schema["provider"]] + authority_boost = Media.get_authority_boost(meta, provider) + + # This matches the order of fields defined in ``es_mapping.py``. return { "_id": row[schema["id"]], "id": row[schema["id"]], + "created_on": row[schema["created_on"]], + "mature": Media.get_maturity(meta, row[schema["mature"]]), + # Keyword fields "identifier": row[schema["identifier"]], + "license": row[schema["license"]].lower(), + "provider": provider, + "source": row[schema["source"]], + "category": category, + # Text-based fields "title": row[schema["title"]], - "foreign_landing_url": row[schema["foreign_landing_url"]], "description": Media.parse_description(meta), "creator": row[schema["creator"]], - "creator_url": row[schema["creator_url"]], + # Rank feature fields + "standardized_popularity": popularity, + "authority_boost": authority_boost, + "max_boost": max(popularity or 1, authority_boost or 1), + "min_boost": min(popularity or 1, authority_boost or 1), + # Nested fields + "tags": Media.parse_detailed_tags(row[schema["tags"]]), + # Extra fields, not indexed "url": row[schema["url"]], - "license": row[schema["license"]].lower(), + "foreign_landing_url": row[schema["foreign_landing_url"]], + "creator_url": row[schema["creator_url"]], "license_version": row[schema["license_version"]], "license_url": Media.get_license_url(meta), - "provider": row[schema["provider"]], - "source": row[schema["source"]], - "category": category, - "created_on": row[schema["created_on"]], - "tags": Media.parse_detailed_tags(row[schema["tags"]]), - "mature": Media.get_maturity(meta, row[schema["mature"]]), - "standardized_popularity": popularity, } @staticmethod @@ -230,28 +242,18 @@ class Index: @staticmethod def database_row_to_elasticsearch_doc(row, schema): extension = Image.get_extension(row[schema["url"]]) - height = row[schema["height"]] width = row[schema["width"]] aspect_ratio = Image.get_aspect_ratio(height, width) size = Image.get_size(height, width) - - meta = row[schema["meta_data"]] - provider = row[schema["provider"]] - authority_boost = Image.get_authority_boost(meta, provider) - attrs = Image.get_instance_attrs(row, schema) - attrs["category"] = attrs["category"] - popularity = attrs["standardized_popularity"] return Image( - thumbnail=row[schema["thumbnail"]], aspect_ratio=aspect_ratio, extension=extension, size=size, - authority_boost=authority_boost, - max_boost=max(popularity or 1, authority_boost or 1), - min_boost=min(popularity or 1, authority_boost or 1), + # Extra fields, not indexed + thumbnail=row[schema["thumbnail"]], **attrs, ) @@ -319,27 +321,18 @@ def database_row_to_elasticsearch_doc(row, schema): alt_files = row[schema["alt_files"]] filetype = row[schema["filetype"]] extension = Audio.get_extensions(filetype, alt_files) - - meta = row[schema["meta_data"]] - provider = row[schema["provider"]] - authority_boost = Audio.get_authority_boost(meta, provider) - attrs = Audio.get_instance_attrs(row, schema) - popularity = attrs["standardized_popularity"] - length = Audio.get_length(row[schema["duration"]]) return Audio( + length=length, + filetype=filetype, + extension=extension, + # Extra fields, not indexed bit_rate=row[schema["bit_rate"]], sample_rate=row[schema["sample_rate"]], genres=row[schema["genres"]], duration=row[schema["duration"]], - length=length, - filetype=filetype, - extension=extension, - authority_boost=authority_boost, - max_boost=max(popularity or 1, authority_boost or 1), - min_boost=min(popularity or 1, authority_boost or 1), **attrs, ) diff --git a/ingestion_server/ingestion_server/es_mapping.py b/ingestion_server/ingestion_server/es_mapping.py index b780088a8e7..c70e4c1a4fd 100644 --- a/ingestion_server/ingestion_server/es_mapping.py +++ b/ingestion_server/ingestion_server/es_mapping.py @@ -1,13 +1,22 @@ -def index_settings(table_name): +from ingestion_server.constants.media_types import AUDIO_TYPE, IMAGE_TYPE, MediaType + + +def index_settings(media_type: MediaType): """ Return the Elasticsearch mapping for a given table in the database. - :param table_name: The name of the table in the upstream database. - :return: + :param media_type: The name of the table in the upstream database. + :return: the settings for the ES mapping """ + + number_of_shards: dict[MediaType, int] = { + IMAGE_TYPE: 18, + AUDIO_TYPE: 1, + } + settings = { "index": { - "number_of_shards": 18, + "number_of_shards": number_of_shards[media_type], "number_of_replicas": 0, "refresh_interval": "-1", }, @@ -51,109 +60,89 @@ def index_settings(table_name): }, } common_mappings = { + "dynamic": False, # extra fields are stored in ``_source`` but not indexed "properties": { "id": {"type": "long"}, + "created_on": {"type": "date"}, + "mature": {"type": "boolean"}, + # Keyword fields "identifier": { - "fields": {"keyword": {"type": "keyword", "ignore_above": 256}}, - "type": "text", + # TODO: Remove subfield when API is updated + "fields": {"keyword": {"type": "keyword"}}, + "type": "keyword", }, + "extension": {"type": "keyword"}, + "license": { + # TODO: Remove subfield when API is updated + "fields": {"keyword": {"type": "keyword"}}, + "type": "keyword", + }, + "provider": {"type": "keyword"}, + "source": { + # TODO: Remove subfield when API is updated + "fields": {"keyword": {"type": "keyword"}}, + "type": "keyword", + }, + "filetype": {"type": "keyword"}, + "category": {"type": "keyword"}, + # Text-based fields "title": { "type": "text", + "analyzer": "custom_english", "similarity": "boolean", "fields": { "keyword": {"type": "keyword", "ignore_above": 256}, "raw": {"type": "text", "index": True}, }, - "analyzer": "custom_english", - }, - "foreign_landing_url": { - "fields": {"keyword": {"ignore_above": 256, "type": "keyword"}}, - "type": "text", }, "description": { - "fields": { - "keyword": {"type": "keyword", "similarity": "boolean"}, - "raw": {"type": "text", "index": True}, - }, "type": "text", "analyzer": "custom_english", + "similarity": "boolean", + "fields": {"raw": {"type": "text", "index": True}}, }, "creator": { "type": "text", "fields": {"keyword": {"type": "keyword", "ignore_above": 256}}, }, - "url": { - "fields": {"keyword": {"type": "keyword", "ignore_above": 256}}, - "type": "text", - }, - "extension": { - "fields": {"keyword": {"ignore_above": 8, "type": "keyword"}}, - "type": "text", - }, - "license": { - "fields": {"keyword": {"ignore_above": 256, "type": "keyword"}}, - "type": "text", - }, - "license_version": { - "type": "text", - "fields": {"keyword": {"type": "keyword", "ignore_above": 256}}, - }, - "license_url": { - "fields": {"keyword": {"type": "keyword", "ignore_above": 256}}, - "type": "text", - }, - "provider": { - "type": "text", - "fields": {"keyword": {"type": "keyword", "ignore_above": 256}}, - }, - "source": { - "fields": {"keyword": {"ignore_above": 256, "type": "keyword"}}, - "type": "text", + # Rank feature fields + "standardized_popularity": {"type": "rank_feature"}, + "authority_boost": {"type": "rank_feature"}, + "authority_penalty": { + "type": "rank_feature", + "positive_score_impact": False, }, - "filetype": {"type": "keyword"}, - "created_on": {"type": "date"}, + "max_boost": {"type": "rank_feature"}, + "min_boost": {"type": "rank_feature"}, + # Nested fields "tags": { "properties": { "accuracy": {"type": "float"}, + # Text-based fields "name": { "type": "text", + "analyzer": "custom_english", "fields": { "keyword": {"type": "keyword", "ignore_above": 256}, "raw": {"type": "text", "index": True}, }, - "analyzer": "custom_english", }, } }, - "mature": {"type": "boolean"}, - "standardized_popularity": {"type": "rank_feature"}, - "authority_boost": {"type": "rank_feature"}, - "authority_penalty": { - "type": "rank_feature", - "positive_score_impact": False, - }, - "max_boost": {"type": "rank_feature"}, - "min_boost": {"type": "rank_feature"}, - "category": {"type": "keyword"}, - } + }, } media_properties = { "image": { - "aspect_ratio": { - "fields": {"keyword": {"type": "keyword"}}, - "type": "text", - }, - "size": {"fields": {"keyword": {"type": "keyword"}}, "type": "text"}, + # Keyword fields + "aspect_ratio": {"type": "keyword"}, + "size": {"type": "keyword"}, }, "audio": { - "bit_rate": {"type": "integer"}, - "sample_rate": {"type": "integer"}, - "genres": {"fields": {"keyword": {"type": "keyword"}}, "type": "text"}, - "duration": {"type": "integer"}, + # Keyword fields "length": {"type": "keyword"}, }, } media_mappings = common_mappings.copy() - media_mappings["properties"].update(media_properties[table_name]) + media_mappings["properties"].update(media_properties[media_type]) result = {"settings": settings.copy(), "mappings": media_mappings} return result