From 5b45a78dca284e504280964951f079fca1866226 Mon Sep 17 00:00:00 2001 From: Daniel Standish <15932138+dstandish@users.noreply.github.com> Date: Wed, 2 Mar 2022 13:34:44 -0800 Subject: [PATCH] Add docs for `db upgrade` / `db downgrade` (#21879) --- airflow/cli/cli_parser.py | 20 ++++++--- docs/apache-airflow/best-practices.rst | 38 ++++++++++++++++ docs/apache-airflow/usage-cli.rst | 60 ++++++++++++++++++++++++++ docs/spelling_wordlist.txt | 2 + 4 files changed, 114 insertions(+), 6 deletions(-) diff --git a/airflow/cli/cli_parser.py b/airflow/cli/cli_parser.py index f621052a2fb41..fa1163152ce46 100644 --- a/airflow/cli/cli_parser.py +++ b/airflow/cli/cli_parser.py @@ -526,26 +526,27 @@ def string_list_type(val): "-n", "--version", ), - help="The airflow version to downgrade to", + help="The airflow version to downgrade to. Note: must provide either `--revision` or `--version`.", ) ARG_DB_FROM_VERSION = Arg( ("--from-version",), - help="(Optional) if generating sql, may supply a _from_ version", + help="(Optional) If generating sql, may supply a _from_ version", ) ARG_DB_REVISION = Arg( ( "-r", "--revision", ), - help="The airflow revision to downgrade to", + help="The airflow revision to downgrade to. Note: must provide either `--revision` or `--version`.", ) ARG_DB_FROM_REVISION = Arg( ("--from-revision",), - help="(Optional) if generating sql, may supply a _from_ revision", + help="(Optional) If generating sql, may supply a _from_ revision", ) ARG_DB_SQL = Arg( ("-s", "--sql-only"), - help="Don't actually run migrations; just print out sql scripts for offline migration. 
" + "Required if using either `--from-revision` or `--from-version`.", action="store_true", default=False, ) @@ -1362,7 +1363,14 @@ class GroupCommand(NamedTuple): ), ActionCommand( name='downgrade', - help="Downgrade the schema of the metadata database", + help="Downgrade the schema of the metadata database.", + description=( + "Downgrade the schema of the metadata database. " + "You must provide either `--revision` or `--version`. " + "To print but not execute commands, use option `--sql-only`. " + "If using options `--from-revision` or `--from-version`, you must also use `--sql-only`, " + "because if actually *running* migrations, we should only migrate from the *current* revision." + ), func=lazy_load_command('airflow.cli.commands.db_command.downgrade'), args=( ARG_DB_REVISION, diff --git a/docs/apache-airflow/best-practices.rst b/docs/apache-airflow/best-practices.rst index 1440421e7cf38..9f489a2267886 100644 --- a/docs/apache-airflow/best-practices.rst +++ b/docs/apache-airflow/best-practices.rst @@ -577,3 +577,41 @@ For connection, use :envvar:`AIRFLOW_CONN_{CONN_ID}`. conn_uri = conn.get_uri() with mock.patch.dict("os.environ", AIRFLOW_CONN_MY_CONN=conn_uri): assert "cat" == Connection.get("my_conn").login + +Metadata DB maintenance +----------------------- + +Over time, the metadata database will increase its storage footprint as more DAG and task runs and event logs accumulate. + +You can use the Airflow CLI to purge old data with the command ``airflow db clean``. + +See :ref:`db clean usage <cli-db-clean>` for more details. + +Upgrades and downgrades +----------------------- + +Back up your database +^^^^^^^^^^^^^^^^^^^^^ + +It's always a wise idea to back up the metadata database before undertaking any operation modifying the database. + +Disable the scheduler +^^^^^^^^^^^^^^^^^^^^^ + +You might consider disabling the Airflow cluster while you perform such maintenance. 
+ +One way to do so would be to set the param ``[scheduler] > use_job_schedule`` to ``False`` and wait for any running DAGs to complete; after this no new DAG runs will be created unless externally triggered. + +A *better* way (though it's a bit more manual) is to use the ``dags pause`` command. You'll need to keep track of the DAGs that are paused before you begin this operation so that you know which ones to unpause after maintenance is complete. First run ``airflow dags list`` and store the list of unpaused DAGs. Then use this same list to run both ``dags pause`` for each DAG prior to maintenance, and ``dags unpause`` after. A benefit of this is you can try un-pausing just one or two DAGs (perhaps dedicated :ref:`test dags <integration-test-dags>`) after the upgrade to make sure things are working before turning everything back on. + +.. _integration-test-dags: + +Add "integration test" DAGs +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +It can be helpful to add a couple "integration test" DAGs that use all the common services in your ecosystem (e.g. S3, Snowflake, Vault) but with dummy resources or "dev" accounts. These test DAGs can be the ones you turn on *first* after an upgrade, because if they fail, it doesn't matter and you can revert to your backup without negative consequences. However, if they succeed, they should prove that your cluster is able to run tasks with the libraries and services that you need to use. + +Prune data before upgrading +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Some database migrations can be time-consuming. If your metadata database is very large, consider pruning some of the old data with the :ref:`db clean <cli-db-clean>` command prior to performing the upgrade. 
*Use with caution.* diff --git a/docs/apache-airflow/usage-cli.rst b/docs/apache-airflow/usage-cli.rst index 393eb4c4b5edb..fc4439988646a 100644 --- a/docs/apache-airflow/usage-cli.rst +++ b/docs/apache-airflow/usage-cli.rst @@ -199,3 +199,63 @@ Both ``json`` and ``yaml`` formats make it easier to manipulate the data using c "sd": "2020-11-29T14:53:56.931243+00:00", "ed": "2020-11-29T14:53:57.126306+00:00" } + +.. _cli-db-clean: + +Purge history from metadata database +------------------------------------ + +.. note:: + + It's strongly recommended that you back up the metadata database before running the ``db clean`` command. + +The ``db clean`` command works by deleting from each table the records older than the provided ``--clean-before-timestamp``. + +You can optionally provide a list of tables to perform deletes on. If no list of tables is supplied, all tables will be included. + +You can use the ``--dry-run`` option to print the row counts in the primary tables to be cleaned. + +Beware cascading deletes +^^^^^^^^^^^^^^^^^^^^^^^^ + +Keep in mind that some tables have foreign key relationships defined with ``ON DELETE CASCADE`` so deletes in one table may trigger deletes in others. For example, the ``task_instance`` table keys to the ``dag_run`` table, so if a DagRun record is deleted, all of its associated task instances will also be deleted. + +Special handling for DAG runs +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Commonly, Airflow determines which DagRun to run next by looking up the latest DagRun. If you delete all DAG runs, Airflow may schedule an old DAG run that was already completed, e.g. if you have set ``catchup=True``. So the ``db clean`` command will preserve the latest non-manually-triggered DAG run to preserve continuity in scheduling. + +Considerations for backfillable DAGs +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Not all DAGs are designed for use with Airflow's backfill command. But for those which are, special care is warranted. 
If you delete DAG runs, and if you run backfill over a range of dates that includes the deleted DAG runs, those runs will be recreated and run again. For this reason, if you have DAGs that fall into this category you may want to refrain from deleting DAG runs and only clean other large tables such as task instance and log etc. + +.. _cli-db-upgrade: + +Upgrading Airflow +----------------- + +Run ``airflow db upgrade --help`` for usage details. + +Running migrations manually +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If desired, you can generate the sql statements for an upgrade and apply each upgrade migration manually, one at a time. To do so you may use either the ``--range`` (for Airflow version) or ``--revision-range`` (for Alembic revision) option with ``db upgrade``. Do *not* skip running the Alembic revision id update commands; this is how Airflow will know where you are upgrading from the next time you need to. See :doc:`/migrations-ref` for a mapping between revision and version. + + +.. _cli-db-downgrade: + +Downgrading Airflow +------------------- + +.. note:: + + It's recommended that you back up your database before running ``db downgrade`` or any other database operation. + +You can downgrade to a particular Airflow version with the ``db downgrade`` command. Alternatively you may provide an Alembic revision id to downgrade to. + +If you want to preview the commands but not execute them, use option ``--sql-only``. + +Options ``--from-revision`` and ``--from-version`` may only be used in conjunction with the ``--sql-only`` option, because when actually *running* migrations we should always downgrade from the current revision. + +For a mapping between Airflow version and Alembic revision see :doc:`/migrations-ref`. 
diff --git a/docs/spelling_wordlist.txt b/docs/spelling_wordlist.txt index da0c8a519549f..2f5ddc0993c0d 100644 --- a/docs/spelling_wordlist.txt +++ b/docs/spelling_wordlist.txt @@ -504,6 +504,7 @@ backcompat backend backends backfill +backfillable backfilled backfilling backfills @@ -1441,6 +1442,7 @@ unix unmappable unmapped unpause +unpaused unpausing unpredicted unqueued