From ccaff5ab65dbbc31b1a40954d03a4c814859f2da Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Sun, 22 Aug 2021 18:04:06 +0200 Subject: [PATCH] Automatically use utf8mb3_general_ci collation for mysql (#17729) The index size is too big in case utf8mb4 is used as encoding for MySQL database. We already had `sql_engine_collation_for_ids` configuration to allow the id fields to use different collation, but the user had to set it up manually in case of a failure to create a db and it was not obvious, not discoverable and rather clumsy. Since this is really only a problem with MySQL the easy solution is to force this parameter to utf8mb3_general_ci for all mysql databases. It has no negative consequences, really as all relevant IDs are ASCII anyway. Related: #17603 --- airflow/config_templates/config.yml | 7 ++-- airflow/config_templates/default_airflow.cfg | 7 ++-- airflow/models/base.py | 13 ++++++ docs/apache-airflow/howto/set-up-database.rst | 8 +++- scripts/ci/docker-compose/backend-mysql.yml | 1 - scripts/in_container/entrypoint_ci.sh | 3 -- tests/sensors/test_base.py | 41 +++++++++++++++++++ 7 files changed, 68 insertions(+), 12 deletions(-) diff --git a/airflow/config_templates/config.yml b/airflow/config_templates/config.yml index f4a26c74c9b4f..5214f3d8bbc69 100644 --- a/airflow/config_templates/config.yml +++ b/airflow/config_templates/config.yml @@ -81,9 +81,10 @@ - name: sql_engine_collation_for_ids description: | Collation for ``dag_id``, ``task_id``, ``key`` columns in case they have different encoding. - This is particularly useful in case of mysql with utf8mb4 encoding because - primary keys for XCom table has too big size and ``sql_engine_collation_for_ids`` should - be set to ``utf8mb3_general_ci``. + By default this collation is the same as the database collation, however for ``mysql`` and ``mariadb`` + the default is ``utf8mb3_general_ci`` so that the index sizes of our index keys will not exceed + the maximum size of allowed index when collation is set to ``utf8mb4`` variant + (see https://github.com/apache/airflow/pull/17603#issuecomment-901121618). version_added: 2.0.0 type: string example: ~ diff --git a/airflow/config_templates/default_airflow.cfg b/airflow/config_templates/default_airflow.cfg index c53213b6452df..a44b86fbd7f19 100644 --- a/airflow/config_templates/default_airflow.cfg +++ b/airflow/config_templates/default_airflow.cfg @@ -63,9 +63,10 @@ sql_alchemy_conn = sqlite:///{AIRFLOW_HOME}/airflow.db sql_engine_encoding = utf-8 # Collation for ``dag_id``, ``task_id``, ``key`` columns in case they have different encoding. -# This is particularly useful in case of mysql with utf8mb4 encoding because -# primary keys for XCom table has too big size and ``sql_engine_collation_for_ids`` should -# be set to ``utf8mb3_general_ci``. +# By default this collation is the same as the database collation, however for ``mysql`` and ``mariadb`` +# the default is ``utf8mb3_general_ci`` so that the index sizes of our index keys will not exceed +# the maximum size of allowed index when collation is set to ``utf8mb4`` variant +# (see https://github.com/apache/airflow/pull/17603#issuecomment-901121618). # sql_engine_collation_for_ids = # If SqlAlchemy should pool database connections. diff --git a/airflow/models/base.py b/airflow/models/base.py index 55584e4f2a27b..03221246e9ad8 100644 --- a/airflow/models/base.py +++ b/airflow/models/base.py @@ -44,6 +44,19 @@ def get_id_collation_args(): if collation: return {'collation': collation} else: + # Automatically use utf8mb3_general_ci collation for mysql + # This is backwards-compatible. All our IDS are ASCII anyway so even if + # we migrate from previously installed database with different collation and we end up mixture of + # COLLATIONS, it's not a problem whatsoever (and we keep it small enough so that our indexes + # for MYSQL will not exceed the maximum index size. + # + # See https://github.com/apache/airflow/pull/17603#issuecomment-901121618. + # + # We cannot use session/dialect as at this point we are trying to determine the right connection + # parameters, so we use the connection + conn = conf.get('core', 'sql_alchemy_conn', fallback='') + if conn.startswith('mysql') or conn.startswith("mariadb"): + return {'collation': 'utf8mb3_general_ci'} return {} diff --git a/docs/apache-airflow/howto/set-up-database.rst b/docs/apache-airflow/howto/set-up-database.rst index 30ea6a20e4a31..02395d57d22f5 100644 --- a/docs/apache-airflow/howto/set-up-database.rst +++ b/docs/apache-airflow/howto/set-up-database.rst @@ -154,14 +154,18 @@ In the example below, a database ``airflow_db`` and user with username ``airflo .. code-block:: sql - CREATE DATABASE airflow_db CHARACTER SET utf8 COLLATE utf8_general_ci; + CREATE DATABASE airflow_db CHARACTER SET utf8 COLLATE utf8mb4_unicode_ci; CREATE USER 'airflow_user' IDENTIFIED BY 'airflow_pass'; GRANT ALL PRIVILEGES ON airflow_db.* TO 'airflow_user'; .. note:: - The database must use a UTF-8 character set + The database must use a UTF-8 character set. A small caveat that you must be aware of is that utf8 in newer versions of MySQL is really utf8mb4 which + causes Airflow indexes to grow too large (see https://github.com/apache/airflow/pull/17603#issuecomment-901121618). Therefore as of Airflow 2.2 + all MySQL databases have ``sql_engine_collation_for_ids`` set automatically to ``utf8mb3_general_ci`` (unless you override it). This might + lead to a mixture of collation ids for id fields in Airflow Database, but it has no negative consequences since all relevant IDs in Airflow use + ASCII characters only. We rely on more strict ANSI SQL settings for MySQL in order to have sane defaults. Make sure to have specified ``explicit_defaults_for_timestamp=1`` option under ``[mysqld]`` section diff --git a/scripts/ci/docker-compose/backend-mysql.yml b/scripts/ci/docker-compose/backend-mysql.yml index 6574dd7c02b2e..02f86bda89ee4 100644 --- a/scripts/ci/docker-compose/backend-mysql.yml +++ b/scripts/ci/docker-compose/backend-mysql.yml @@ -21,7 +21,6 @@ services: environment: - BACKEND=mysql - AIRFLOW__CORE__SQL_ALCHEMY_CONN=mysql://root@mysql/airflow?charset=utf8mb4 - - AIRFLOW__CORE__SQL_ENGINE_COLLATION_FOR_IDS=utf8mb3_general_ci - AIRFLOW__CELERY__RESULT_BACKEND=db+mysql://root@mysql/airflow?charset=utf8mb4 - AIRFLOW__CORE__EXECUTOR=LocalExecutor depends_on: diff --git a/scripts/in_container/entrypoint_ci.sh b/scripts/in_container/entrypoint_ci.sh index 0c0016354ee7a..766ea10597b84 100755 --- a/scripts/in_container/entrypoint_ci.sh +++ b/scripts/in_container/entrypoint_ci.sh @@ -39,9 +39,6 @@ echo echo "Airflow home: ${AIRFLOW_HOME}" echo "Airflow sources: ${AIRFLOW_SOURCES}" echo "Airflow core SQL connection: ${AIRFLOW__CORE__SQL_ALCHEMY_CONN:=}" -if [[ -n "${AIRFLOW__CORE__SQL_ENGINE_COLLATION_FOR_IDS=}" ]]; then - echo "Airflow collation for IDs: ${AIRFLOW__CORE__SQL_ENGINE_COLLATION_FOR_IDS}" -fi echo diff --git a/tests/sensors/test_base.py b/tests/sensors/test_base.py index dd3bf29a0ad49..a26bc94963f2e 100644 --- a/tests/sensors/test_base.py +++ b/tests/sensors/test_base.py @@ -19,6 +19,7 @@ import unittest from datetime import timedelta +from unittest import mock from unittest.mock import Mock, patch import pytest @@ -26,6 +27,7 @@ from airflow.exceptions import AirflowException, AirflowRescheduleException, AirflowSensorTimeout from airflow.models import DagBag, TaskInstance, TaskReschedule +from airflow.models.base import get_id_collation_args from airflow.models.dag import DAG from airflow.operators.dummy import DummyOperator from airflow.sensors.base import BaseSensorOperator, poke_mode_only @@ -655,3 +657,42 @@ def test_poke_mode_only_bad_poke(self): sensor = DummyPokeOnlySensor(task_id='foo', mode='poke', poke_changes_mode=True, dag=self.dag) with pytest.raises(ValueError): sensor.poke({}) + + +class TestCollation(unittest.TestCase): + @mock.patch.dict( + 'os.environ', + AIRFLOW__CORE__SQL_ALCHEMY_CONN='postgres://host/the_database', + ) + def test_collation_empty_on_non_mysql(self): + assert {} == get_id_collation_args() + + @mock.patch.dict( + 'os.environ', + AIRFLOW__CORE__SQL_ALCHEMY_CONN='mysql://host/the_database', + ) + def test_collation_set_on_mysql(self): + assert {"collation": "utf8mb3_general_ci"} == get_id_collation_args() + + @mock.patch.dict( + 'os.environ', + AIRFLOW__CORE__SQL_ALCHEMY_CONN='mysql+pymsql://host/the_database', + ) + def test_collation_set_on_mysql_with_pymsql(self): + assert {"collation": "utf8mb3_general_ci"} == get_id_collation_args() + + @mock.patch.dict( + 'os.environ', + AIRFLOW__CORE__SQL_ALCHEMY_CONN='mysql://host/the_database', + AIRFLOW__CORE__SQL_ENGINE_COLLATION_FOR_IDS='ascii', + ) + def test_collation_override_on_non_mysql(self): + assert {"collation": "ascii"} == get_id_collation_args() + + @mock.patch.dict( + 'os.environ', + AIRFLOW__CORE__SQL_ALCHEMY_CONN='postgres://host/the_database', + AIRFLOW__CORE__SQL_ENGINE_COLLATION_FOR_IDS='ascii', + ) + def test_collation_override_on_mysql(self): + assert {"collation": "ascii"} == get_id_collation_args()