diff --git a/catalog/justfile b/catalog/justfile index 8e261fde859..bb4f9446e36 100644 --- a/catalog/justfile +++ b/catalog/justfile @@ -142,6 +142,27 @@ generate-dag-docs fail_on_diff="false": fi fi + +# Generate the media properties documentation +generate-media-props fail_on_diff="true": + #!/bin/bash + set -e + python utilities/media_props_gen/generate_media_properties.py \&\& chmod 666 utilities/media_props_gen/media_properties.md + # Move the file to the documentation folder + mv utilities/media_props_gen/media_properties.md ../documentation/meta/media_properties.md + echo -n "Running linting..." + # Linting step afterwards is necessary since the generated output differs greatly from what prettier expects + just ../lint prettier ../documentation/meta/media_properties.md &>/dev/null || true + echo "Done!" + if {{ fail_on_diff }}; then + set +e + git diff --exit-code ../documentation/meta/media_properties.md + if [ $? -ne 0 ]; then + printf "\n\n\e[31m!! Changes found in Media properties documentation, please run 'just generate-media-props' locally and commit difference !!\n\n" + exit 1 + fi + fi + # Generate files for a new provider add-provider provider_name endpoint +media_types="image": python3 templates/create_provider_ingester.py "{{ provider_name }}" "{{ endpoint }}" -m {{ media_types }} diff --git a/catalog/utilities/media_props_gen/__init__.py b/catalog/utilities/media_props_gen/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/catalog/utilities/media_props_gen/column_parser.py b/catalog/utilities/media_props_gen/column_parser.py new file mode 100644 index 00000000000..fe8358dec18 --- /dev/null +++ b/catalog/utilities/media_props_gen/column_parser.py @@ -0,0 +1,125 @@ +import ast +import copy +from pathlib import Path + + +COLUMNS_PATH = Path(__file__).parents[2] / "dags" / "common" / "storage" / "columns.py" + +COLUMNS_URL = "https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py" # noqa: E501 + +COLUMN = 
{ + "python_type": None, + "name": None, + "db_name": None, + "nullable": None, + "required": False, + "upsert_strategy": "newest_non_null", + "base_column": None, +} + +COLUMN_PROPS = COLUMN.keys() + + +def file_to_ast(file_name) -> ast.Module: + with open(file_name) as f: + file_contents = f.read() + return ast.parse(file_contents) + + +CODE = file_to_ast(COLUMNS_PATH) + + +def format_python_column( + python_column: dict[str, any], + python_column_lines: dict[str, tuple[int, int]], +) -> str: + """ + Format the Python column properties dictionary to a string that can be + used in the markdown file. + """ + col_type = python_column.pop("python_type") + start, end = python_column_lines[col_type] + python_column_string = f"[{col_type}]({COLUMNS_URL}#L{start}-L{end}) (`" + + col_name = python_column.pop("name") + column_db_name = python_column.pop("db_name") + if column_db_name and col_name != column_db_name: + python_column_string += f'name="{col_name}", ' + + python_column_string += ", ".join( + [f"{k}={v}" for k, v in python_column.items() if v is not None] + ) + + return f"{python_column_string}`)" + + +def get_python_column_types() -> dict[str, tuple[int, int]]: + """ + Extract all types of columns with their line numbers for hyperlinks. + Sample output: `StringColumn: (3, 5)` + """ + return { + item.name: (item.lineno, item.end_lineno) + for item in ast.iter_child_nodes(CODE) + if isinstance(item, ast.ClassDef) and item.name.endswith("Column") + } + + +def parse_col_argument_value(item): + """ + Return `attr` for Attribute value (upsert strategies), `func.id` for Call value (base_column), + and `value` for Constant values such as `True`. + We don't save the List type used for sql_args. 
+ """ + # Upsert strategy + if isinstance(item, ast.Attribute) and isinstance(item.value, ast.Name): + return item.attr + # Base column + elif isinstance(item, ast.Call) and isinstance(item.func, ast.Name): + return item.func.id + elif isinstance(item, ast.Constant): + return item.value + return item.value + + +def parse_python_columns() -> dict[str, any]: + """ + Parse columns.py to a dictionary with the column's `db_name` as a key, + and the string describing Python column as a value. + Example output: + "height": "[IntegerColumn](/link/to/column/type/definition/)" + + "(`name="height", nullable=True, required=False, upsert_strategy=newest_non_null`)" + """ + python_column_lines = get_python_column_types() + + # Extracts all the assignments of the form `column_name = Column(...)` + cols: list[ast.Call] = [ + item.value + for item in ast.iter_child_nodes(CODE) + if isinstance(item, ast.Assign) + and isinstance(item.value, ast.Call) + and isinstance(item.value.func, ast.Name) + and item.value.func.id.endswith("Column") + ] + + columns = {} + for col in cols: + parsed_column = copy.copy(COLUMN) | { + col.arg: parse_col_argument_value(col.value) + for col in col.keywords + if col.arg in COLUMN_PROPS + } + parsed_column["python_type"] = col.func.id + + # If required is true, then the media item is discarded if the column is null. + # This means that the column cannot have `None` as a value. 
+ if parsed_column["nullable"] is None: + parsed_column["nullable"] = ( + True + if parsed_column["required"] is None + else not parsed_column["required"] + ) + db_name = parsed_column.get("db_name") or parsed_column["name"] + columns[db_name] = format_python_column(parsed_column, python_column_lines) + + return columns diff --git a/catalog/utilities/media_props_gen/db.py b/catalog/utilities/media_props_gen/db.py new file mode 100644 index 00000000000..836d524851c --- /dev/null +++ b/catalog/utilities/media_props_gen/db.py @@ -0,0 +1,81 @@ +import re +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Literal + + +LOCAL_POSTGRES_FOLDER = Path(__file__).parents[3] / "docker" / "upstream_db" +SQL_PATH = { + "image": LOCAL_POSTGRES_FOLDER / "0003_openledger_image_schema.sql", + "audio": LOCAL_POSTGRES_FOLDER / "0006_openledger_audio_schema.sql", +} +sql_types = [ + "integer", + "boolean", + "uuid", + "double precision", + "jsonb", + "timestamp with time zone", + "character varying", +] +sql_type_regex = re.compile(f"({'|'.join(sql_types)})") +MediaType = Literal["audio", "image"] + + +@dataclass +class FieldSqlInfo: + nullable: bool + datatype: str + constraint: str + + +def create_db_props_dict( + media_type: MediaType, +) -> dict[Any, Any] | dict[Any, dict[str, FieldSqlInfo]]: + """ + Parse the DDL for a media type and return a dictionary of field + sql definitions. 
+ """ + + create_table_regex = re.compile(r"CREATE\s+TABLE\s+\w+\.(\w+)\s+\(([\s\S]*?)\);") + sql_path = SQL_PATH[media_type] + + with open(sql_path) as f: + contents = f.read() + table_description_matches = create_table_regex.search(contents) + if not table_description_matches: + print(f"Could not find table description for {media_type} in {sql_path}") + return {} + table_name = table_description_matches.group(1) + if table_name != media_type: + print(f"Table name {table_name} does not match media type {media_type}") + return {} + field_descriptions = [ + field.strip() + for field in table_description_matches.group(2).split("\n") + if field.strip() + ] + fields = {} + for field in field_descriptions: + field_name = field.split(" ")[0] + # False if "not null" in field.lower() else True + field_constraint = "" + try: + field_type = sql_type_regex.search(field).group(1) + if field_type == "character varying": + char_limit = field.split("(")[1].split(")")[0] + field_constraint = f"({char_limit})" + + if "[]" in field: + field_type = f"array of {field_type}" + except AttributeError: + raise ValueError(f"Could not find type for field {field_name} in {field}") + + fields[field_name] = { + "sql": FieldSqlInfo( + nullable="NOT NULL" not in field, + datatype=field_type, + constraint=field_constraint, + ) + } + return fields diff --git a/catalog/utilities/media_props_gen/generate_media_properties.py b/catalog/utilities/media_props_gen/generate_media_properties.py new file mode 100644 index 00000000000..0505af31820 --- /dev/null +++ b/catalog/utilities/media_props_gen/generate_media_properties.py @@ -0,0 +1,137 @@ +"""Automatic media properties generation.""" + +import logging +from dataclasses import dataclass +from pathlib import Path + +from column_parser import parse_python_columns +from db import MediaType, create_db_props_dict +from md import Md + + +log = logging.getLogger(__name__) +# Silence noisy modules 
+logging.getLogger("common.storage.media").setLevel(logging.WARNING) + +# Constants +PARENT = Path(__file__).parent +DOC_MD_PATH = PARENT / "media_properties.md" +SOURCE_MD_PATH = PARENT / "media_props.md" + +PREAMBLE = open(Path(__file__).parent / "preamble.md").read() + +MEDIA_TYPES: list[MediaType] = ["audio", "image"] + + +@dataclass +class FieldInfo: + name: str + nullable: bool + datatype: str + constraint: str + python_column: str = "" + + +def generate_media_properties() -> dict: + """ + Generate a dictionary documenting each property of the media items. + For each property, return the database field and the Python object shape. + """ + media_props = {} + python_columns = parse_python_columns() + + for media_type in MEDIA_TYPES: + media_props[media_type] = create_db_props_dict(media_type) + + # Add the python column properties to the media properties dictionary + for prop in media_props[media_type].keys(): + media_props[media_type][prop]["python_column"] = python_columns.get( + prop, "" + ) + + return media_props + + +def generate_db_props_string(field_name: str, field: dict) -> tuple[str, str]: + field_sql = field["sql"] + + constraint = f"{' '+field_sql.constraint if field_sql.constraint else ''}" + nullable = f"{'nullable' if field_sql.nullable else 'non-nullable'}" + props_string = f"{field_sql.datatype}{constraint}, {nullable}" + + return f"[`{field_name}`](#{field_name})", props_string + + +def generate_media_props_table(media_properties) -> str: + """Generate the markdown table with media properties.""" + + # Convert the dictionary of field descriptions to a md table + table = "| Name | DB Field | Python Column |\n" + table += "| --- | --- | --- |\n" + for field_name, field in media_properties.items(): + name, db_properties = generate_db_props_string(field_name, field) + + table += ( + f"| {name} | {db_properties} | " f"{field.get('python_column', '')} |\n" + ) + return table + + +def generate_long_form_doc(markdown_descriptions: dict, media_properties: 
dict) -> str: + """ + Generate the long-form markdown documentation for each media property. + Uses the markdown descriptions from the `media_props.md` source file. + Also uses `media_properties` dictionary to set which media types have + the specific properties. + """ + media_docs = "" + for prop, description in markdown_descriptions.items(): + prop_heading = f"{Md.heading(3, prop)}" + + media_types = [ + f"`{media_type}`" + for media_type, value in media_properties.items() + if prop in value.keys() + ] + prop_heading += f"_Media Types_: {', '.join(media_types)}\n\n" + + prop_doc = "".join( + [f"{Md.heading(4, k)}{Md.line(v)}" for k, v in description.items()] + ) + media_docs += prop_heading + prop_doc + + return media_docs + + +def generate_markdown_doc() -> str: + """ + Parse the media property descriptions from the source code and `media_props.md` + Generate the tables with media properties database column and + Python objects characteristics, and a long-form documentation for each property. 
+ """ + media_properties = generate_media_properties() + markdown_descriptions = Md.parse(SOURCE_MD_PATH) + + image_table = generate_media_props_table(media_properties["image"]) + audio_table = generate_media_props_table(media_properties["audio"]) + + long_form_doc = generate_long_form_doc(markdown_descriptions, media_properties) + + media_props_doc = f""" +{PREAMBLE} +{Md.heading(2, "Image Properties")}{image_table} +{Md.heading(2, "Audio Properties")}{audio_table} +{Md.heading(2, "Media Property Descriptions")}{long_form_doc} +""".strip() + return media_props_doc + + +def write_media_props_doc(path: Path = DOC_MD_PATH) -> None: + """Generate the media properties documentation and write it to a file.""" + doc_text = generate_markdown_doc() + log.info(f"Writing DAG doc to {path}") + path.write_text(doc_text) + + +if __name__ == "__main__": + write_media_props_doc() diff --git a/catalog/utilities/media_props_gen/md.py b/catalog/utilities/media_props_gen/md.py new file mode 100644 index 00000000000..8010fb01114 --- /dev/null +++ b/catalog/utilities/media_props_gen/md.py @@ -0,0 +1,41 @@ +from pathlib import Path + + +class Md: + @staticmethod + def heading(level: int, text: str) -> str: + """Return a markdown heading of the given level, ending with a newline.""" + return f"{'#' * level} {text}\n" + + @staticmethod + def line(text: str) -> str: + """Return the text terminated by a newline.""" + return f"{text}\n" + + @staticmethod + def parse(file_name: Path) -> dict[str, dict[str, str]]: + """ + Parse the markdown documentation file and return a dictionary with the + field name as key and the description as value. 
+ """ + with open(file_name) as f: + contents = [line for line in f.readlines() if line.strip()] + current_field = "" + properties = {} + prop = "" + value = {} + for i, line in enumerate(contents): + if line.startswith("# "): + if current_field and value: + properties[current_field] = value + current_field = line.replace("# ", "").strip() + value = {} + continue + elif line.startswith("## "): + prop = line.replace("## ", "").strip() + value[prop] = "" + continue + else: + value[prop] += line + + return properties  # NOTE(review): last field is never stored diff --git a/catalog/utilities/media_props_gen/media_props.md b/catalog/utilities/media_props_gen/media_props.md new file mode 100644 index 00000000000..9553720a4c3 --- /dev/null +++ b/catalog/utilities/media_props_gen/media_props.md @@ -0,0 +1,425 @@ +# identifier + +## Description + +The unique UUID identifier for the media item. + +## Object Shape + +UUID + +## Selection Criteria + +Created when the item is inserted into the main table. + +## Normalization and Validation + +## Existing Data Inconsistencies + +# created_on + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# updated_on + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# ingestion_type + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# provider + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# source + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# foreign_identifier + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# foreign_landing_url + +## Description + +## Object Shape + 
+## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# url + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# thumbnail + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# width + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# height + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# filesize + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# license + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# license_version + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# creator + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# creator_url + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# title + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# meta_data + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# tags + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# watermarked + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data 
Inconsistencies + +# last_synced_with_source + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# removed_from_source + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# filetype + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# category + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# standardized_popularity + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# duration + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# bit_rate + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# sample_rate + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# genres + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# alt_files + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# audio_set + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# audio_set_foreign_identifier + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# set_position + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# 
alt_files + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies diff --git a/catalog/utilities/media_props_gen/postamble.md b/catalog/utilities/media_props_gen/postamble.md new file mode 100644 index 00000000000..e69de29bb2d diff --git a/catalog/utilities/media_props_gen/preamble.md b/catalog/utilities/media_props_gen/preamble.md new file mode 100644 index 00000000000..324049620bf --- /dev/null +++ b/catalog/utilities/media_props_gen/preamble.md @@ -0,0 +1,10 @@ +# Media Properties + +_This document is auto-generated from the source code in +[/catalog/utilities/media_props_gen/generate_media_properties.py](https://github.com/WordPress/openverse/blob/main/catalog/utilities/media_props_gen/generate_media_properties.py)._ + +This is a list of the media properties, with the descriptions of corresponding +database columns and Python objects that are used to store and retrieve media +data. The order of the properties corresponds to their order in the `image_view` +materialized view. Property names typically match those of the database columns, +except when noted otherwise in the Python column's name property. 
diff --git a/documentation/meta/index.md b/documentation/meta/index.md index 23d71cf4484..d3a55e6968f 100644 --- a/documentation/meta/index.md +++ b/documentation/meta/index.md @@ -11,4 +11,5 @@ decision_making/index documentation/index monitoring/index maintenance/index +media_properties ``` diff --git a/documentation/meta/media_properties.md b/documentation/meta/media_properties.md new file mode 100644 index 00000000000..3dfa2e09558 --- /dev/null +++ b/documentation/meta/media_properties.md @@ -0,0 +1,562 @@ +# Media Properties + +_This document is auto-generated from the source code in +[/catalog/utilities/media_props_gen/generate_media_properties.py](https://github.com/WordPress/openverse/blob/main/catalog/utilities/media_props_gen/generate_media_properties.py)._ + +This is a list of the media properties, with the descriptions of corresponding +database columns and Python objects that are used to store and retrieve media +data. The order of the properties corresponds to their order in the `image_view` +materialized view. Property names typically match those of the database columns, +except when noted otherwise in the Python column's name property. 
+ +## Image Properties + +| Name | DB Field | Python Column | +| ----------------------------------------------------- | -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [`identifier`](#identifier) | uuid, nullable | [UUIDColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L500-L517) (`nullable=True, required=False, upsert_strategy=newest_non_null`) | +| [`created_on`](#created_on) | timestamp with time zone, non-nullable | [TimestampColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L520-L547) (`nullable=False, required=True, upsert_strategy=no_change`) | +| [`updated_on`](#updated_on) | timestamp with time zone, non-nullable | [TimestampColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L520-L547) (`nullable=False, required=True, upsert_strategy=newest_non_null`) | +| [`ingestion_type`](#ingestion_type) | character varying (80), nullable | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (`nullable=True, required=False, upsert_strategy=newest_non_null`) | +| [`provider`](#provider) | character varying (80), nullable | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (`nullable=True, required=False, upsert_strategy=newest_non_null`) | +| [`source`](#source) | character varying (80), nullable | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (`nullable=True, required=False, upsert_strategy=newest_non_null`) | +| [`foreign_identifier`](#foreign_identifier) | character varying (3000), nullable | 
[StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (`nullable=False, required=True, upsert_strategy=newest_non_null`) | +| [`foreign_landing_url`](#foreign_landing_url) | character varying (1000), nullable | [URLColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L550-L596) (`nullable=True, required=True, upsert_strategy=newest_non_null`) | +| [`url`](#url) | character varying (3000), non-nullable | [URLColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L550-L596) (`nullable=False, required=True, upsert_strategy=newest_non_null`) | +| [`thumbnail`](#thumbnail) | character varying (3000), nullable | [URLColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L550-L596) (`name="thumbnail_url", nullable=True, required=False, upsert_strategy=newest_non_null`) | +| [`width`](#width) | integer, nullable | [IntegerColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L216-L256) (`nullable=True, required=False, upsert_strategy=newest_non_null`) | +| [`height`](#height) | integer, nullable | [IntegerColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L216-L256) (`nullable=True, required=False, upsert_strategy=newest_non_null`) | +| [`filesize`](#filesize) | integer, nullable | [IntegerColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L216-L256) (`nullable=True, required=False, upsert_strategy=newest_non_null`) | +| [`license`](#license) | character varying (50), non-nullable | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (`name="license_", nullable=False, required=True, upsert_strategy=newest_non_null`) | +| [`license_version`](#license_version) | character varying (25), nullable | 
[StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (`nullable=False, required=True, upsert_strategy=newest_non_null`) | +| [`creator`](#creator) | character varying (2000), nullable | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (`nullable=True, required=False, upsert_strategy=newest_non_null`) | +| [`creator_url`](#creator_url) | character varying (2000), nullable | [URLColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L550-L596) (`nullable=True, required=False, upsert_strategy=newest_non_null`) | +| [`title`](#title) | character varying (5000), nullable | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (`nullable=True, required=False, upsert_strategy=newest_non_null`) | +| [`meta_data`](#meta_data) | jsonb, nullable | [JSONColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L388-L454) (`nullable=True, required=False, upsert_strategy=newest_non_null`) | +| [`tags`](#tags) | jsonb, nullable | [JSONColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L388-L454) (`nullable=True, required=False, upsert_strategy=merge_jsonb_arrays`) | +| [`watermarked`](#watermarked) | boolean, nullable | [BooleanColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L340-L385) (`nullable=True, required=False, upsert_strategy=newest_non_null`) | +| [`last_synced_with_source`](#last_synced_with_source) | timestamp with time zone, nullable | [TimestampColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L520-L547) (`nullable=True, required=False, upsert_strategy=newest_non_null`) | +| [`removed_from_source`](#removed_from_source) | boolean, non-nullable | 
[BooleanColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L340-L385) (`nullable=False, required=True, upsert_strategy=false`) | +| [`filetype`](#filetype) | character varying (5), nullable | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (`nullable=True, required=False, upsert_strategy=newest_non_null`) | +| [`category`](#category) | character varying (80), nullable | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (`nullable=True, required=False, upsert_strategy=newest_non_null`) | +| [`standardized_popularity`](#standardized_popularity) | double precision, nullable | [CalculatedColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L259-L337) (`nullable=True, required=False, upsert_strategy=newest_non_null`) | + +## Audio Properties + +| Name | DB Field | Python Column | +| --------------------------------------------------------------- | ----------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [`identifier`](#identifier) | uuid, nullable | [UUIDColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L500-L517) (`nullable=True, required=False, upsert_strategy=newest_non_null`) | +| [`created_on`](#created_on) | timestamp with time zone, non-nullable | [TimestampColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L520-L547) (`nullable=False, required=True, upsert_strategy=no_change`) | +| [`updated_on`](#updated_on) | timestamp with time zone, non-nullable | 
[TimestampColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L520-L547) (`nullable=False, required=True, upsert_strategy=newest_non_null`) | +| [`ingestion_type`](#ingestion_type) | character varying (80), nullable | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (`nullable=True, required=False, upsert_strategy=newest_non_null`) | +| [`provider`](#provider) | character varying (80), nullable | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (`nullable=True, required=False, upsert_strategy=newest_non_null`) | +| [`source`](#source) | character varying (80), nullable | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (`nullable=True, required=False, upsert_strategy=newest_non_null`) | +| [`foreign_identifier`](#foreign_identifier) | character varying (3000), nullable | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (`nullable=False, required=True, upsert_strategy=newest_non_null`) | +| [`foreign_landing_url`](#foreign_landing_url) | character varying (1000), nullable | [URLColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L550-L596) (`nullable=True, required=True, upsert_strategy=newest_non_null`) | +| [`url`](#url) | character varying (3000), non-nullable | [URLColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L550-L596) (`nullable=False, required=True, upsert_strategy=newest_non_null`) | +| [`thumbnail`](#thumbnail) | character varying (3000), nullable | [URLColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L550-L596) (`name="thumbnail_url", nullable=True, required=False, upsert_strategy=newest_non_null`) | +| 
[`filetype`](#filetype) | character varying (5), nullable | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (`nullable=True, required=False, upsert_strategy=newest_non_null`) | +| [`duration`](#duration) | integer, nullable | [IntegerColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L216-L256) (`nullable=True, required=False, upsert_strategy=newest_non_null`) | +| [`bit_rate`](#bit_rate) | integer, nullable | [IntegerColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L216-L256) (`nullable=True, required=False, upsert_strategy=newest_non_null`) | +| [`sample_rate`](#sample_rate) | integer, nullable | [IntegerColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L216-L256) (`nullable=True, required=False, upsert_strategy=newest_non_null`) | +| [`category`](#category) | character varying (80), nullable | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (`nullable=True, required=False, upsert_strategy=newest_non_null`) | +| [`genres`](#genres) | array of character varying (80), nullable | [ArrayColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L599-L651) (`nullable=True, required=False, upsert_strategy=newest_non_null, base_column=StringColumn`) | +| [`audio_set`](#audio_set) | jsonb, nullable | [JSONColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L388-L454) (`nullable=True, required=False, upsert_strategy=newest_non_null`) | +| [`set_position`](#set_position) | integer, nullable | [IntegerColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L216-L256) (`nullable=True, required=False, upsert_strategy=newest_non_null`) | +| [`alt_files`](#alt_files) | jsonb, nullable | 
[JSONColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L388-L454) (`nullable=True, required=False, upsert_strategy=merge_jsonb_arrays`) | +| [`filesize`](#filesize) | integer, nullable | [IntegerColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L216-L256) (`nullable=True, required=False, upsert_strategy=newest_non_null`) | +| [`license`](#license) | character varying (50), non-nullable | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (`name="license_", nullable=False, required=True, upsert_strategy=newest_non_null`) | +| [`license_version`](#license_version) | character varying (25), nullable | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (`nullable=False, required=True, upsert_strategy=newest_non_null`) | +| [`creator`](#creator) | character varying (2000), nullable | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (`nullable=True, required=False, upsert_strategy=newest_non_null`) | +| [`creator_url`](#creator_url) | character varying (2000), nullable | [URLColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L550-L596) (`nullable=True, required=False, upsert_strategy=newest_non_null`) | +| [`title`](#title) | character varying (5000), nullable | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (`nullable=True, required=False, upsert_strategy=newest_non_null`) | +| [`meta_data`](#meta_data) | jsonb, nullable | [JSONColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L388-L454) (`nullable=True, required=False, upsert_strategy=newest_non_null`) | +| [`tags`](#tags) | jsonb, nullable | 
[JSONColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L388-L454) (`nullable=True, required=False, upsert_strategy=merge_jsonb_arrays`) | +| [`watermarked`](#watermarked) | boolean, nullable | [BooleanColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L340-L385) (`nullable=True, required=False, upsert_strategy=newest_non_null`) | +| [`last_synced_with_source`](#last_synced_with_source) | timestamp with time zone, nullable | [TimestampColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L520-L547) (`nullable=True, required=False, upsert_strategy=newest_non_null`) | +| [`removed_from_source`](#removed_from_source) | boolean, non-nullable | [BooleanColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L340-L385) (`nullable=False, required=True, upsert_strategy=false`) | +| [`standardized_popularity`](#standardized_popularity) | double precision, nullable | [CalculatedColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L259-L337) (`nullable=True, required=False, upsert_strategy=newest_non_null`) | +| [`audio_set_foreign_identifier`](#audio_set_foreign_identifier) | character varying (1000), nullable | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (`nullable=True, required=False, upsert_strategy=newest_non_null`) | + +## Media Property Descriptions + +### identifier + +_Media Types_: `audio`, `image` + +#### Description + +The unique identifier (a UUID) for the media item. + +#### Object Shape + +UUID + +#### Selection Criteria + +Created when the item is inserted into the main table. 
+ +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### created_on + +_Media Types_: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### updated_on + +_Media Types_: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### ingestion_type + +_Media Types_: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### provider + +_Media Types_: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### source + +_Media Types_: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### foreign_identifier + +_Media Types_: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### foreign_landing_url + +_Media Types_: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### url + +_Media Types_: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### thumbnail + +_Media Types_: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### width + +_Media Types_: `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### 
Existing Data Inconsistencies + +### height + +_Media Types_: `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### filesize + +_Media Types_: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### license + +_Media Types_: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### license_version + +_Media Types_: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### creator + +_Media Types_: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### creator_url + +_Media Types_: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### title + +_Media Types_: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### meta_data + +_Media Types_: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### tags + +_Media Types_: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### watermarked + +_Media Types_: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### last_synced_with_source + 
+_Media Types_: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### removed_from_source + +_Media Types_: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### filetype + +_Media Types_: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### category + +_Media Types_: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### standardized_popularity + +_Media Types_: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### duration + +_Media Types_: `audio` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### bit_rate + +_Media Types_: `audio` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### sample_rate + +_Media Types_: `audio` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### genres + +_Media Types_: `audio` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### alt_files + +_Media Types_: `audio` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### audio_set + +_Media Types_: `audio` + +#### Description + +#### Object Shape + +#### 
Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### audio_set_foreign_identifier + +_Media Types_: `audio` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### set_position + +_Media Types_: `audio` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies