From 353df22037bf0e95a01c47724ffed4947ac5607d Mon Sep 17 00:00:00 2001 From: pierrejeambrun Date: Sat, 1 Jul 2023 17:28:56 +0200 Subject: [PATCH 1/2] Use linear time regular expressions The standard regexp library can consume > O(n) in certain circumstances. The re2 library does not have this issue. --- .pre-commit-config.yaml | 8 ++ STATIC_CODE_CHECKS.rst | 2 + .../endpoints/provider_endpoint.py | 4 +- airflow/cli/commands/provider_command.py | 4 +- airflow/cli/commands/user_command.py | 4 +- airflow/config_templates/default_celery.py | 7 +- airflow/configuration.py | 23 ++-- airflow/decorators/base.py | 8 +- airflow/kubernetes/pod_generator.py | 4 +- .../kubernetes/pod_generator_deprecated.py | 5 +- airflow/metrics/validators.py | 7 +- airflow/models/dag.py | 8 +- airflow/models/dagrun.py | 4 +- airflow/security/utils.py | 5 +- airflow/serialization/serde.py | 8 +- airflow/utils/cli.py | 4 +- airflow/utils/db_cleanup.py | 5 +- airflow/utils/email.py | 5 +- airflow/utils/file.py | 19 ++- airflow/utils/log/colored_log.py | 4 +- airflow/utils/log/logging_mixin.py | 5 +- airflow/utils/log/secrets_masker.py | 10 +- airflow/utils/task_group.py | 9 +- airflow/www/fab_security/manager.py | 8 +- airflow/www/views.py | 12 +- .../src/airflow_breeze/pre_commit_ids.py | 1 + images/breeze/output-commands-hash.txt | 2 +- images/breeze/output_static-checks.svg | 130 +++++++++--------- kubernetes_tests/test_base.py | 6 +- 29 files changed, 169 insertions(+), 152 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 55129ceaa1178..efcf6c6e3e245 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -929,6 +929,14 @@ repos: language: python pass_filenames: true files: ^tests/.*\.py$ + - id: check-usage-of-re2-over-re + language: pygrep + name: Use re2 over re + description: Use re2 module instead of re + entry: "^\\s*from re\\s|^\\s*import re\\s" + pass_filenames: true + files: \.py$ + exclude: ^airflow/providers|^dev/.*\.py$|^scripts/.*\.py$|^tests/|^docker_tests/|^docs/.*\.py$|^airflow/utils/helpers.py$ ## ADD MOST PRE-COMMITS ABOVE THAT LINE # The below pre-commits are those requiring CI image to be built - id: mypy-dev diff --git a/STATIC_CODE_CHECKS.rst b/STATIC_CODE_CHECKS.rst index 657976939ff5e..381dc53c93fb8 100644 --- a/STATIC_CODE_CHECKS.rst +++ b/STATIC_CODE_CHECKS.rst @@ -235,6 +235,8 @@ require Breeze Docker image to be built locally. +-----------------------------------------------------------+--------------------------------------------------------------+---------+ | check-urlparse-usage-in-code | Don't use urlparse in code | | +-----------------------------------------------------------+--------------------------------------------------------------+---------+ +| check-usage-of-re2-over-re | Use re2 over re | | ++-----------------------------------------------------------+--------------------------------------------------------------+---------+ | check-xml | Check XML files with xmllint | | +-----------------------------------------------------------+--------------------------------------------------------------+---------+ | codespell | Run codespell to check for common misspellings in files | | diff --git a/airflow/api_connexion/endpoints/provider_endpoint.py b/airflow/api_connexion/endpoints/provider_endpoint.py index c829d9c968d61..0a8594c28ae33 100644 --- a/airflow/api_connexion/endpoints/provider_endpoint.py +++ b/airflow/api_connexion/endpoints/provider_endpoint.py @@ -16,7 +16,7 @@ # under the License. from __future__ import annotations -import re +import re2 from airflow.api_connexion import security from airflow.api_connexion.schemas.provider_schema import ( @@ -30,7 +30,7 @@ def _remove_rst_syntax(value: str) -> str: - return re.sub("[`_<>]", "", value.strip(" \n.")) + return re2.sub("[`_<>]", "", value.strip(" \n.")) def _provider_mapper(provider: ProviderInfo) -> Provider: diff --git a/airflow/cli/commands/provider_command.py b/airflow/cli/commands/provider_command.py index 75c354d686032..876905c1c2309 100644 --- a/airflow/cli/commands/provider_command.py +++ b/airflow/cli/commands/provider_command.py @@ -17,7 +17,7 @@ """Providers sub-commands.""" from __future__ import annotations -import re +import re2 from airflow.cli.simple_table import AirflowConsole from airflow.providers_manager import ProvidersManager @@ -27,7 +27,7 @@ def _remove_rst_syntax(value: str) -> str: - return re.sub("[`_<>]", "", value.strip(" \n.")) + return re2.sub("[`_<>]", "", value.strip(" \n.")) @suppress_logs_and_warning diff --git a/airflow/cli/commands/user_command.py b/airflow/cli/commands/user_command.py index 7cf9c2b57f61e..a5a5be9787565 100644 --- a/airflow/cli/commands/user_command.py +++ b/airflow/cli/commands/user_command.py @@ -22,10 +22,10 @@ import json import os import random -import re import string from typing import Any +import re2 from marshmallow import Schema, fields, validate from marshmallow.exceptions import ValidationError @@ -164,7 +164,7 @@ def users_export(args): # In the User model the first and last name fields have underscores, # but the corresponding parameters in the CLI don't def remove_underscores(s): - return re.sub("_", "", s) + return re2.sub("_", "", s) users = [ { diff --git a/airflow/config_templates/default_celery.py b/airflow/config_templates/default_celery.py index b0390a82cf467..933d2bbdba484 100644 --- a/airflow/config_templates/default_celery.py +++ b/airflow/config_templates/default_celery.py @@ -19,9 +19,10 @@ from __future__ import annotations import logging -import re import ssl +import re2 + from airflow.configuration import conf from airflow.exceptions import AirflowConfigException, AirflowException @@ -88,7 +89,7 @@ def _broker_supports_visibility_timeout(url): "ca_certs": conf.get("celery", "SSL_CACERT"), "cert_reqs": ssl.CERT_REQUIRED, } - elif broker_url and re.search("rediss?://|sentinel://", broker_url): + elif broker_url and re2.search("rediss?://|sentinel://", broker_url): broker_use_ssl = { "ssl_keyfile": conf.get("celery", "SSL_KEY"), "ssl_certfile": conf.get("celery", "SSL_CERT"), @@ -114,7 +115,7 @@ def _broker_supports_visibility_timeout(url): f"all necessary certs and key ({e})." ) -if re.search("rediss?://|amqp://|rpc://", result_backend): +if re2.search("rediss?://|amqp://|rpc://", result_backend): log.warning( "You have configured a result_backend of %s, it is highly recommended " "to use an alternative result_backend (i.e. a database).", diff --git a/airflow/configuration.py b/airflow/configuration.py index 288595efbd94c..fbe080247bfad 100644 --- a/airflow/configuration.py +++ b/airflow/configuration.py @@ -23,7 +23,6 @@ import multiprocessing import os import pathlib -import re import shlex import stat import subprocess @@ -36,10 +35,10 @@ from configparser import _UNSET, ConfigParser, NoOptionError, NoSectionError # type: ignore from contextlib import contextmanager, suppress from json.decoder import JSONDecodeError -from re import Pattern -from typing import IO, Any, Dict, Iterable, Set, Tuple, Union +from typing import IO, Any, Dict, Iterable, Pattern, Set, Tuple, Union from urllib.parse import urlsplit +import re2 from typing_extensions import overload from airflow.exceptions import AirflowConfigException @@ -55,7 +54,7 @@ warnings.filterwarnings(action="default", category=DeprecationWarning, module="airflow") warnings.filterwarnings(action="default", category=PendingDeprecationWarning, module="airflow") -_SQLITE3_VERSION_PATTERN = re.compile(r"(?P^\d+(?:\.\d+)*)\D?.*$") +_SQLITE3_VERSION_PATTERN = re2.compile(r"(?P^\d+(?:\.\d+)*)\D?.*$") ConfigType = Union[str, int, float, bool] ConfigOptionsDictType = Dict[str, ConfigType] @@ -269,36 +268,36 @@ def inversed_deprecated_sections(self): # about. Mapping of section -> setting -> { old, replace, by_version } deprecated_values: dict[str, dict[str, tuple[Pattern, str, str]]] = { "core": { - "hostname_callable": (re.compile(r":"), r".", "2.1"), + "hostname_callable": (re2.compile(r":"), r".", "2.1"), }, "webserver": { - "navbar_color": (re.compile(r"\A#007A87\Z", re.IGNORECASE), "#fff", "2.1"), - "dag_default_view": (re.compile(r"^tree$"), "grid", "3.0"), + "navbar_color": (re2.compile(r"(?i)\A#007A87\z"), "#fff", "2.1"), + "dag_default_view": (re2.compile(r"^tree$"), "grid", "3.0"), }, "email": { "email_backend": ( - re.compile(r"^airflow\.contrib\.utils\.sendgrid\.send_email$"), + re2.compile(r"^airflow\.contrib\.utils\.sendgrid\.send_email$"), r"airflow.providers.sendgrid.utils.emailer.send_email", "2.1", ), }, "logging": { "log_filename_template": ( - re.compile(re.escape("{{ ti.dag_id }}/{{ ti.task_id }}/{{ ts }}/{{ try_number }}.log")), + re2.compile(re2.escape("{{ ti.dag_id }}/{{ ti.task_id }}/{{ ts }}/{{ try_number }}.log")), "XX-set-after-default-config-loaded-XX", "3.0", ), }, "api": { "auth_backends": ( - re.compile(r"^airflow\.api\.auth\.backend\.deny_all$|^$"), + re2.compile(r"^airflow\.api\.auth\.backend\.deny_all$|^$"), "airflow.api.auth.backend.session", "3.0", ), }, "elasticsearch": { "log_id_template": ( - re.compile("^" + re.escape("{dag_id}-{task_id}-{execution_date}-{try_number}") + "$"), + re2.compile("^" + re2.escape("{dag_id}-{task_id}-{execution_date}-{try_number}") + "$"), "{dag_id}-{task_id}-{run_id}-{map_index}-{try_number}", "3.0", ) @@ -425,7 +424,7 @@ def _upgrade_postgres_metastore_conn(self): FutureWarning, ) self.upgraded_values[(section, key)] = old_value - new_value = re.sub("^" + re.escape(f"{parsed.scheme}://"), f"{good_scheme}://", old_value) + new_value = re2.sub("^" + re2.escape(f"{parsed.scheme}://"), f"{good_scheme}://", old_value) self._update_env_var(section=section, name=key, new_value=new_value) # if the old value is set via env var, we need to wipe it diff --git a/airflow/decorators/base.py b/airflow/decorators/base.py index df890c036d562..7e3c8ba2c9e30 100644 --- a/airflow/decorators/base.py +++ b/airflow/decorators/base.py @@ -17,7 +17,6 @@ from __future__ import annotations import inspect -import re import warnings from functools import cached_property from itertools import chain @@ -38,6 +37,7 @@ ) import attr +import re2 import typing_extensions from sqlalchemy.orm import Session @@ -144,15 +144,15 @@ def get_unique_task_id( return task_id def _find_id_suffixes(dag: DAG) -> Iterator[int]: - prefix = re.split(r"__\d+$", tg_task_id)[0] + prefix = re2.split(r"__\d+$", tg_task_id)[0] for task_id in dag.task_ids: - match = re.match(rf"^{prefix}__(\d+)$", task_id) + match = re2.match(rf"^{prefix}__(\d+)$", task_id) if match is None: continue yield int(match.group(1)) yield 0 # Default if there's no matching task ID. - core = re.split(r"__\d+$", task_id)[0] + core = re2.split(r"__\d+$", task_id)[0] return f"{core}__{max(_find_id_suffixes(dag)) + 1}" diff --git a/airflow/kubernetes/pod_generator.py b/airflow/kubernetes/pod_generator.py index 91e079683abca..844a1abfd8a86 100644 --- a/airflow/kubernetes/pod_generator.py +++ b/airflow/kubernetes/pod_generator.py @@ -28,10 +28,10 @@ import datetime import logging import os -import re import warnings from functools import reduce +import re2 from dateutil import parser from kubernetes.client import models as k8s from kubernetes.client.api_client import ApiClient @@ -65,7 +65,7 @@ def make_safe_label_value(string: str) -> str: way from the original value sent to this function, then we need to truncate to 53 chars, and append it with a unique hash. """ - safe_label = re.sub(r"^[^a-z0-9A-Z]*|[^a-zA-Z0-9_\-\.]|[^a-z0-9A-Z]*$", "", string) + safe_label = re2.sub(r"^[^a-z0-9A-Z]*|[^a-zA-Z0-9_\-\.]|[^a-z0-9A-Z]*$", "", string) if len(safe_label) > MAX_LABEL_LEN or string != safe_label: safe_hash = md5(string.encode()).hexdigest()[:9] diff --git a/airflow/kubernetes/pod_generator_deprecated.py b/airflow/kubernetes/pod_generator_deprecated.py index 652c993d1ce93..8876556a8d748 100644 --- a/airflow/kubernetes/pod_generator_deprecated.py +++ b/airflow/kubernetes/pod_generator_deprecated.py @@ -25,9 +25,9 @@ from __future__ import annotations import copy -import re import uuid +import re2 from kubernetes.client import models as k8s from airflow.utils.hashlib_wrapper import md5 @@ -70,7 +70,7 @@ def make_safe_label_value(string): way from the original value sent to this function, then we need to truncate to 53 chars, and append it with a unique hash. """ - safe_label = re.sub(r"^[^a-z0-9A-Z]*|[^a-zA-Z0-9_\-\.]|[^a-z0-9A-Z]*$", "", string) + safe_label = re2.sub(r"^[^a-z0-9A-Z]*|[^a-zA-Z0-9_\-\.]|[^a-z0-9A-Z]*$", "", string) if len(safe_label) > MAX_LABEL_LEN or string != safe_label: safe_hash = md5(string.encode()).hexdigest()[:9] @@ -151,7 +151,6 @@ def __init__( extract_xcom: bool = False, priority_class_name: str | None = None, ): - self.pod = k8s.V1Pod() self.pod.api_version = "v1" self.pod.kind = "Pod" diff --git a/airflow/metrics/validators.py b/airflow/metrics/validators.py index 0965b534ff46b..0fd5fd1adef8c 100644 --- a/airflow/metrics/validators.py +++ b/airflow/metrics/validators.py @@ -21,12 +21,13 @@ import abc import logging -import re import string import warnings from functools import partial, wraps from typing import Callable, Iterable, Pattern, cast +import re2 + from airflow.configuration import conf from airflow.exceptions import InvalidStatsNameException @@ -78,7 +79,7 @@ class MetricNameLengthExemptionWarning(Warning): r"^dagrun\.schedule_delay\.(?P.*)$", r"^dagrun\.(?P.*)\.first_task_scheduling_delay$", } -BACK_COMPAT_METRIC_NAMES: set[Pattern[str]] = {re.compile(name) for name in BACK_COMPAT_METRIC_NAME_PATTERNS} +BACK_COMPAT_METRIC_NAMES: set[Pattern[str]] = {re2.compile(name) for name in BACK_COMPAT_METRIC_NAME_PATTERNS} OTEL_NAME_MAX_LENGTH = 63 @@ -132,7 +133,7 @@ def stat_name_otel_handler( # If the name is in the exceptions list, do not fail it for being too long. # It may still be deemed invalid for other reasons below. for exemption in BACK_COMPAT_METRIC_NAMES: - if re.match(exemption, stat_name): + if re2.match(exemption, stat_name): # There is a back-compat exception for this name; proceed name_length_exemption = True matched_exemption = exemption.pattern diff --git a/airflow/models/dag.py b/airflow/models/dag.py index 11468b59ea708..1ec78b64c3791 100644 --- a/airflow/models/dag.py +++ b/airflow/models/dag.py @@ -52,7 +52,7 @@ import jinja2 import pendulum -import re2 as re +import re2 from dateutil.relativedelta import relativedelta from pendulum.tz.timezone import Timezone from sqlalchemy import ( @@ -2361,7 +2361,7 @@ def partial_subset( dag = copy.deepcopy(self, memo) # type: ignore if isinstance(task_ids_or_regex, (str, Pattern)): - matched_tasks = [t for t in self.tasks if re.findall(task_ids_or_regex, t.task_id)] + matched_tasks = [t for t in self.tasks if re2.findall(task_ids_or_regex, t.task_id)] else: matched_tasks = [t for t in self.tasks if t.task_id in task_ids_or_regex] @@ -2828,8 +2828,8 @@ def create_dagrun( regex = airflow_conf.get("scheduler", "allowed_run_id_pattern") - if run_id and not re.match(RUN_ID_REGEX, run_id): - if not regex.strip() or not re.match(regex.strip(), run_id): + if run_id and not re2.match(RUN_ID_REGEX, run_id): + if not regex.strip() or not re2.match(regex.strip(), run_id): raise AirflowException( f"The provided run ID '{run_id}' is invalid. It does not match either " f"the configured pattern: '{regex}' or the built-in pattern: '{RUN_ID_REGEX}'" diff --git a/airflow/models/dagrun.py b/airflow/models/dagrun.py index 8f3b3a33018a5..47f48312e701f 100644 --- a/airflow/models/dagrun.py +++ b/airflow/models/dagrun.py @@ -24,7 +24,7 @@ from datetime import datetime from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, NamedTuple, Sequence, TypeVar, overload -import re2 as re +import re2 from sqlalchemy import ( Boolean, Column, @@ -248,7 +248,7 @@ def validate_run_id(self, key: str, run_id: str) -> str | None: if not run_id: return None regex = airflow_conf.get("scheduler", "allowed_run_id_pattern") - if not re.match(regex, run_id) and not re.match(RUN_ID_REGEX, run_id): + if not re2.match(regex, run_id) and not re2.match(RUN_ID_REGEX, run_id): raise ValueError( f"The run_id provided '{run_id}' does not match the pattern '{regex}' or '{RUN_ID_REGEX}'" ) diff --git a/airflow/security/utils.py b/airflow/security/utils.py index 1ffe0e0d29bac..139e96a13c928 100644 --- a/airflow/security/utils.py +++ b/airflow/security/utils.py @@ -34,9 +34,10 @@ # limitations under the License. # """Various security-related utils.""" -import re import socket +import re2 + from airflow.utils.net import get_hostname @@ -49,7 +50,7 @@ def get_components(principal) -> list[str] | None: """ if not principal: return None - return re.split(r"[/@]", str(principal)) + return re2.split(r"[/@]", str(principal)) def replace_hostname_pattern(components, host=None): diff --git a/airflow/serialization/serde.py b/airflow/serialization/serde.py index d996cb9b162f3..8d0ed100ac078 100644 --- a/airflow/serialization/serde.py +++ b/airflow/serialization/serde.py @@ -21,13 +21,13 @@ import enum import functools import logging -import re import sys from importlib import import_module from types import ModuleType -from typing import Any, TypeVar, Union, cast +from typing import Any, Pattern, TypeVar, Union, cast import attr +import re2 import airflow.serialization.serializers from airflow.configuration import conf @@ -357,9 +357,9 @@ def _register(): @functools.lru_cache(maxsize=None) -def _get_patterns() -> list[re.Pattern]: +def _get_patterns() -> list[Pattern]: patterns = conf.get("core", "allowed_deserialization_classes").split() - return [re.compile(re.sub(r"(\w)\.", r"\1\..", p)) for p in patterns] + return [re2.compile(re2.sub(r"(\w)\.", r"\1\..", p)) for p in patterns] _register() diff --git a/airflow/utils/cli.py b/airflow/utils/cli.py index 56ac166b5441a..c800ed71fcd8f 100644 --- a/airflow/utils/cli.py +++ b/airflow/utils/cli.py @@ -21,7 +21,6 @@ import functools import logging import os -import re import socket import sys import threading @@ -32,6 +31,7 @@ from pathlib import Path from typing import TYPE_CHECKING, Callable, TypeVar, cast +import re2 from sqlalchemy.orm import Session from airflow import settings @@ -252,7 +252,7 @@ def get_dags(subdir: str | None, dag_id: str, use_regex: bool = False): if not use_regex: return [get_dag(subdir, dag_id)] dagbag = DagBag(process_subdir(subdir)) - matched_dags = [dag for dag in dagbag.dags.values() if re.search(dag_id, dag.dag_id)] + matched_dags = [dag for dag in dagbag.dags.values() if re2.search(dag_id, dag.dag_id)] if not matched_dags: raise AirflowException( f"dag_id could not be found with regex: {dag_id}. Either the dag did not exist or " diff --git a/airflow/utils/db_cleanup.py b/airflow/utils/db_cleanup.py index 4e91352572292..22fb0020e72a7 100644 --- a/airflow/utils/db_cleanup.py +++ b/airflow/utils/db_cleanup.py @@ -141,13 +141,14 @@ def _dump_table_to_file(*, target_table, file_path, export_format, session): def _do_delete(*, query, orm_model, skip_archive, session): - import re from datetime import datetime + import re2 + print("Performing Delete...") # using bulk delete # create a new table and copy the rows there - timestamp_str = re.sub(r"[^\d]", "", datetime.utcnow().isoformat())[:14] + timestamp_str = re2.sub(r"[^\d]", "", datetime.utcnow().isoformat())[:14] target_table_name = f"{ARCHIVE_TABLE_PREFIX}{orm_model.name}__{timestamp_str}" print(f"Moving data to table {target_table_name}") bind = session.get_bind() diff --git a/airflow/utils/email.py b/airflow/utils/email.py index ee570399e7f21..e807b8f75d9f8 100644 --- a/airflow/utils/email.py +++ b/airflow/utils/email.py @@ -20,7 +20,6 @@ import collections.abc import logging import os -import re import smtplib import warnings from email.mime.application import MIMEApplication @@ -29,6 +28,8 @@ from email.utils import formatdate from typing import Any, Iterable +import re2 + from airflow.configuration import conf from airflow.exceptions import AirflowConfigException, AirflowException, RemovedInAirflow3Warning @@ -328,4 +329,4 @@ def _get_email_list_from_str(addresses: str) -> list[str]: :return: A list of email addresses. """ pattern = r"\s*[,;]\s*" - return [address for address in re.split(pattern, addresses)] + return [address for address in re2.split(pattern, addresses)] diff --git a/airflow/utils/file.py b/airflow/utils/file.py index 4a54adbe48b4f..08b4048ad58af 100644 --- a/airflow/utils/file.py +++ b/airflow/utils/file.py @@ -21,12 +21,12 @@ import io import logging import os -import re import zipfile from collections import OrderedDict from pathlib import Path -from typing import Generator, NamedTuple, Protocol, overload +from typing import Generator, NamedTuple, Pattern, Protocol, overload +import re2 from pathspec.patterns import GitWildMatchPattern from airflow.configuration import conf @@ -53,15 +53,15 @@ def match(path: Path, rules: list[_IgnoreRule]) -> bool: class _RegexpIgnoreRule(NamedTuple): """Typed namedtuple with utility functions for regexp ignore rules.""" - pattern: re.Pattern + pattern: Pattern base_dir: Path @staticmethod def compile(pattern: str, base_dir: Path, definition_file: Path) -> _IgnoreRule | None: """Build an ignore rule from the supplied regexp pattern and log a useful warning if it is invalid.""" try: - return _RegexpIgnoreRule(re.compile(pattern), base_dir) - except re.error as e: + return _RegexpIgnoreRule(re2.compile(pattern), base_dir) + except re2.error as e: log.warning("Ignoring invalid regex '%s' from %s: %s", pattern, definition_file, e) return None @@ -79,7 +79,7 @@ def match(path: Path, rules: list[_IgnoreRule]) -> bool: class _GlobIgnoreRule(NamedTuple): """Typed namedtuple with utility functions for glob ignore rules.""" - pattern: re.Pattern + pattern: Pattern raw_pattern: str include: bool | None = None relative_to: Path | None = None @@ -150,7 +150,7 @@ def mkdirs(path, mode): Path(path).mkdir(mode=mode, parents=True, exist_ok=True) -ZIP_REGEX = re.compile(rf"((.*\.zip){re.escape(os.sep)})?(.*)") +ZIP_REGEX = re2.compile(rf"((.*\.zip){re2.escape(os.sep)})?(.*)") @overload @@ -191,7 +191,6 @@ def open_maybe_zipped(fileloc, mode="r"): if archive and zipfile.is_zipfile(archive): return io.TextIOWrapper(zipfile.ZipFile(archive, mode=mode).open(filename)) else: - return open(fileloc, mode=mode) @@ -217,7 +216,7 @@ def _find_path_from_directory( ignore_file_path = Path(root) / ignore_file_name if ignore_file_path.is_file(): with open(ignore_file_path) as ifile: - lines_no_comments = [re.sub(r"\s*#.*", "", line) for line in ifile.read().split("\n")] + lines_no_comments = [re2.sub(r"\s*#.*", "", line) for line in ifile.read().split("\n")] # append new patterns and filter out "None" objects, which are invalid patterns patterns += [ p @@ -327,7 +326,7 @@ def find_dag_file_paths(directory: str | os.PathLike[str], safe_mode: bool) -> l return file_paths -COMMENT_PATTERN = re.compile(r"\s*#.*") +COMMENT_PATTERN = re2.compile(r"\s*#.*") def might_contain_dag(file_path: str, safe_mode: bool, zip_file: zipfile.ZipFile | None = None) -> bool: diff --git a/airflow/utils/log/colored_log.py b/airflow/utils/log/colored_log.py index 6df2c98fcd138..aea7c255e618e 100644 --- a/airflow/utils/log/colored_log.py +++ b/airflow/utils/log/colored_log.py @@ -18,11 +18,11 @@ """Class responsible for colouring logs based on log level.""" from __future__ import annotations -import re import sys from logging import LogRecord from typing import Any +import re2 from colorlog import TTYColoredFormatter from colorlog.escape_codes import esc, escape_codes @@ -61,7 +61,7 @@ def _color_arg(arg: Any) -> str | float | int: @staticmethod def _count_number_of_arguments_in_message(record: LogRecord) -> int: - matches = re.findall(r"%.", record.msg) + matches = re2.findall(r"%.", record.msg) return len(matches) if matches else 0 def _color_record_args(self, record: LogRecord) -> LogRecord: diff --git a/airflow/utils/log/logging_mixin.py b/airflow/utils/log/logging_mixin.py index 02f89f2812174..646c73f1b4bc0 100644 --- a/airflow/utils/log/logging_mixin.py +++ b/airflow/utils/log/logging_mixin.py @@ -20,14 +20,15 @@ import abc import enum import logging -import re import sys from io import IOBase from logging import Handler, Logger, StreamHandler from typing import IO, Any, TypeVar, cast +import re2 + # 7-bit C1 ANSI escape sequences -ANSI_ESCAPE = re.compile(r"\x1B[@-_][0-?]*[ -/]*[@-~]") +ANSI_ESCAPE = re2.compile(r"\x1B[@-_][0-?]*[ -/]*[@-~]") # Private: A sentinel objects diff --git a/airflow/utils/log/secrets_masker.py b/airflow/utils/log/secrets_masker.py index 54547c1e6e6e1..2c5e4c47e0d4e 100644 --- a/airflow/utils/log/secrets_masker.py +++ b/airflow/utils/log/secrets_masker.py @@ -19,7 +19,6 @@ import collections.abc import logging -import re import sys from functools import cached_property from typing import ( @@ -31,12 +30,15 @@ Iterable, Iterator, List, + Pattern, TextIO, Tuple, TypeVar, Union, ) +import re2 + from airflow import settings from airflow.compat.functools import cache from airflow.typing_compat import TypeGuard @@ -144,7 +146,7 @@ def _is_v1_env_var(v: Any) -> TypeGuard[V1EnvVar]: class SecretsMasker(logging.Filter): """Redact secrets from logs.""" - replacer: re.Pattern | None = None + replacer: Pattern | None = None patterns: set[str] ALREADY_FILTERED_FLAG = "__SecretsMasker_filtered" @@ -332,13 +334,13 @@ def add_mask(self, secret: str | dict | Iterable, name: str | None = None): new_mask = False for s in self._adaptations(secret): if s: - pattern = re.escape(s) + pattern = re2.escape(s) if pattern not in self.patterns and (not name or should_hide_value_for_key(name)): self.patterns.add(pattern) new_mask = True if new_mask: - self.replacer = re.compile("|".join(self.patterns)) + self.replacer = re2.compile("|".join(self.patterns)) elif isinstance(secret, collections.abc.Iterable): for v in secret: diff --git a/airflow/utils/task_group.py b/airflow/utils/task_group.py index 58af85bb75119..d59ecaf40b4d2 100644 --- a/airflow/utils/task_group.py +++ b/airflow/utils/task_group.py @@ -24,10 +24,11 @@ import copy import functools import operator -import re import weakref from typing import TYPE_CHECKING, Any, Generator, Iterator, Sequence +import re2 + from airflow.compat.functools import cache from airflow.exceptions import ( AirflowDagCycleException, @@ -166,11 +167,11 @@ def _check_for_group_id_collisions(self, add_suffix_on_collision: bool): if self._group_id in self.used_group_ids: if not add_suffix_on_collision: raise DuplicateTaskIdFound(f"group_id '{self._group_id}' has already been added to the DAG") - base = re.split(r"__\d+$", self._group_id)[0] + base = re2.split(r"__\d+$", self._group_id)[0] suffixes = sorted( - int(re.split(r"^.+__", used_group_id)[1]) + int(re2.split(r"^.+__", used_group_id)[1]) for used_group_id in self.used_group_ids - if used_group_id is not None and re.match(rf"^{base}__\d+$", used_group_id) + if used_group_id is not None and re2.match(rf"^{base}__\d+$", used_group_id) ) if not suffixes: self._group_id += "__1" diff --git a/airflow/www/fab_security/manager.py b/airflow/www/fab_security/manager.py index 7145128e21b50..be0c4c63e39fd 100644 --- a/airflow/www/fab_security/manager.py +++ b/airflow/www/fab_security/manager.py @@ -22,11 +22,11 @@ import datetime import json import logging -import re from functools import cached_property from typing import Any from uuid import uuid4 +import re2 from flask import Flask, current_app, g, session, url_for from flask_appbuilder import AppBuilder from flask_appbuilder.const import ( @@ -703,7 +703,7 @@ def get_oauth_user_info(self, provider, resp): def _azure_parse_jwt(self, id_token): jwt_token_parts = r"^([^\.\s]*)\.([^\.\s]+)\.([^\.\s]*)$" - matches = re.search(jwt_token_parts, id_token) + matches = re2.search(jwt_token_parts, id_token) if not matches or len(matches.groups()) < 3: log.error("Unable to parse token.") return {} @@ -1375,8 +1375,8 @@ def auth_user_oauth(self, userinfo): def _has_access_builtin_roles(self, role, action_name: str, resource_name: str) -> bool: """Checks permission on builtin role.""" perms = self.builtin_roles.get(role.name, []) - for (_resource_name, _action_name) in perms: - if re.match(_resource_name, resource_name) and re.match(_action_name, action_name): + for _resource_name, _action_name in perms: + if re2.match(_resource_name, resource_name) and re2.match(_action_name, action_name): return True return False diff --git a/airflow/www/views.py b/airflow/www/views.py index 3f0965da38898..aea29e41d84d9 100644 --- a/airflow/www/views.py +++ b/airflow/www/views.py @@ -24,7 +24,6 @@ import json import logging import math -import re import sys import traceback import warnings @@ -39,6 +38,7 @@ import flask.json import lazy_object_proxy import nvd3 +import re2 import sqlalchemy as sqla from croniter import croniter from flask import ( @@ -2093,8 +2093,8 @@ def trigger(self, dag_id: str, session: Session = NEW_SESSION): return redirect(origin) regex = conf.get("scheduler", "allowed_run_id_pattern") - if run_id and not re.match(RUN_ID_REGEX, run_id): - if not regex.strip() or not re.match(regex.strip(), run_id): + if run_id and not re2.match(RUN_ID_REGEX, run_id): + if not regex.strip() or not re2.match(regex.strip(), run_id): flash( f"The provided run ID '{run_id}' is invalid. It does not match either " f"the configured pattern: '{regex}' or the built-in pattern: '{RUN_ID_REGEX}'", @@ -4709,7 +4709,7 @@ def action_mulduplicate(self, connections, session: Session = NEW_SESSION): """Duplicate Multiple connections.""" for selected_conn in connections: new_conn_id = selected_conn.conn_id - match = re.search(r"_copy(\d+)$", selected_conn.conn_id) + match = re2.search(r"_copy(\d+)$", selected_conn.conn_id) base_conn_id = selected_conn.conn_id if match: @@ -4951,8 +4951,8 @@ def _build_link(match_obj): return Markup(f'{text}') cd = escape(description) - cd = re.sub(r"`(.*)[\s+]+<(.*)>`__", _build_link, cd) - cd = re.sub(r"\n", r"
", cd) + cd = re2.sub(r"`(.*)[\s+]+<(.*)>`__", _build_link, cd) + cd = re2.sub(r"\n", r"
", cd) return Markup(cd) diff --git a/dev/breeze/src/airflow_breeze/pre_commit_ids.py b/dev/breeze/src/airflow_breeze/pre_commit_ids.py index b4c713360ed86..1349a962107e2 100644 --- a/dev/breeze/src/airflow_breeze/pre_commit_ids.py +++ b/dev/breeze/src/airflow_breeze/pre_commit_ids.py @@ -71,6 +71,7 @@ "check-system-tests-tocs", "check-tests-unittest-testcase", "check-urlparse-usage-in-code", + "check-usage-of-re2-over-re", "check-xml", "codespell", "compile-www-assets", diff --git a/images/breeze/output-commands-hash.txt b/images/breeze/output-commands-hash.txt index 61e50641f2b6e..241dc9bd43f6c 100644 --- a/images/breeze/output-commands-hash.txt +++ b/images/breeze/output-commands-hash.txt @@ -60,7 +60,7 @@ setup:version:123b462a421884dc2320ffc5e54b2478 setup:201c30ea237ea013a9a209a77092a2e8 shell:13f90c5749811e2f00e24d95e44e946d start-airflow:22c118d58b13a9d190e966bed5bb8ed8 -static-checks:9985d1db64592e29ab71b8a000ce302e +static-checks:a0f6ae35129b99b88ce65aede2c3a150 testing:docker-compose-tests:70167e67853cacd9ca784695d65a7846 testing:helm-tests:936cf28fd84ce4ff5113795fdae9624b testing:integration-tests:35f0ac57157bf8fe227fd080cf216622 diff --git a/images/breeze/output_static-checks.svg b/images/breeze/output_static-checks.svg index e9f5238acee09..e735b2afc3064 100644 --- a/images/breeze/output_static-checks.svg +++ b/images/breeze/output_static-checks.svg @@ -35,8 +35,8 @@ .breeze-static-checks-r1 { fill: #c5c8c6;font-weight: bold } .breeze-static-checks-r2 { fill: #c5c8c6 } .breeze-static-checks-r3 { fill: #d0b344;font-weight: bold } -.breeze-static-checks-r4 { fill: #68a0b3;font-weight: bold } -.breeze-static-checks-r5 { fill: #868887 } +.breeze-static-checks-r4 { fill: #868887 } +.breeze-static-checks-r5 { fill: #68a0b3;font-weight: bold } .breeze-static-checks-r6 { fill: #98a84b;font-weight: bold } .breeze-static-checks-r7 { fill: #8d7b39 } @@ -256,72 +256,72 @@ -Usage: breeze static-checks [OPTIONS] [PRECOMMIT_ARGS]... +Usage: breeze static-checks [OPTIONS] [PRECOMMIT_ARGS]... Run static checks. -╭─ Pre-commit flags ───────────────────────────────────────────────────────────────────────────────────────────────────╮ ---type-tType(s) of the static checks to run.                                              -(all | black | blacken-docs | check-aiobotocore-optional |                        -check-airflow-config-yaml-consistent | check-airflow-provider-compatibility |     -check-apache-license-rat | check-base-operator-partial-arguments |                -check-base-operator-usage | check-boring-cyborg-configuration |                   -check-breeze-top-dependencies-limited | check-builtin-literals |                  -check-changelog-has-no-duplicates | check-core-deprecation-classes |              -check-daysago-import-from-utils | check-decorated-operator-implements-custom-name -| check-docstring-param-types | check-example-dags-urls |                         -check-executables-have-shebangs | check-extra-packages-references |               -check-extras-order | check-for-inclusive-language | check-hooks-apply |           -check-incorrect-use-of-LoggingMixin | check-init-decorator-arguments |            -check-lazy-logging | check-links-to-example-dags-do-not-use-hardcoded-versions |  -check-merge-conflict | check-newsfragments-are-valid |                            -check-no-airflow-deprecation-in-providers | check-no-providers-in-core-examples | -check-no-relative-imports | check-only-new-session-with-provide-session |         -check-persist-credentials-disabled-in-github-workflows |                          -check-pre-commit-information-consistent | check-provide-create-sessions-imports | -check-provider-yaml-valid | check-providers-init-file-missing |                   -check-providers-subpackages-init-file-exist | check-pydevd-left-in-code |         -check-revision-heads-map | check-safe-filter-usage-in-html | check-setup-order |  -check-start-date-not-used-in-defaults | check-system-tests-present |              -check-system-tests-tocs | check-tests-unittest-testcase |                         -check-urlparse-usage-in-code | check-xml | codespell | compile-www-assets |       -compile-www-assets-dev | create-missing-init-py-files-tests | debug-statements |  -detect-private-key | doctoc | end-of-file-fixer | fix-encoding-pragma | flynt |   -identity | insert-license | lint-chart-schema | lint-css | lint-dockerfile |      -lint-helm-chart | lint-json-schema | lint-markdown | lint-openapi |               -mixed-line-ending | mypy-core | mypy-dev | mypy-docs | mypy-providers |           -pretty-format-json | python-no-log-warn | replace-bad-characters | rst-backticks  -| ruff | shellcheck | trailing-whitespace | ts-compile-format-lint-www |          -update-black-version | update-breeze-cmd-output |                                 -update-breeze-readme-config-hash | update-common-sql-api-stubs |                  -update-er-diagram | update-extras | update-in-the-wild-to-be-sorted |             -update-inlined-dockerfile-scripts | update-installed-providers-to-be-sorted |     -update-local-yml-file | update-migration-references |                             -update-providers-dependencies | update-spelling-wordlist-to-be-sorted |           -update-supported-versions | update-vendored-in-k8s-json-schema | update-version | -yamllint)                                                                         ---show-diff-on-failure-sShow diff for files modified by the checks. ---initialize-environmentInitialize environment before running checks. ---max-initialization-attemptsMaximum number of attempts to initialize environment before giving up. -(INTEGER RANGE)                                                        -[default: 3; 1<=x<=10]                                                 ---github-repository-gGitHub repository used to pull, push run images.(TEXT)[default: apache/airflow] -╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -╭─ Selecting files to run the checks on ───────────────────────────────────────────────────────────────────────────────╮ ---file-fList of files to run the checks on.(PATH) ---all-files-aRun checks on all files. ---commit-ref-rRun checks for this commit reference only (can be any git commit-ish reference). Mutually     -exclusive with --last-commit.                                                                 -(TEXT)                                                                                        ---last-commit-cRun checks for all files in last commit. Mutually exclusive with --commit-ref. ---only-my-changes-mRun checks for commits belonging to my PR only: for all commits between merge base to `main`  -branch and HEAD of your branch.                                                               -╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -╭─ Common options ─────────────────────────────────────────────────────────────────────────────────────────────────────╮ ---verbose-vPrint verbose information about performed steps. ---dry-run-DIf dry-run is set, commands are only printed, not executed. ---help-hShow this message and exit. -╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Pre-commit flags ───────────────────────────────────────────────────────────────────────────────────────────────────╮ +--type-tType(s) of the static checks to run.                                              +(all | black | blacken-docs | check-aiobotocore-optional |                        +check-airflow-config-yaml-consistent | check-airflow-provider-compatibility |     +check-apache-license-rat | check-base-operator-partial-arguments |                +check-base-operator-usage | check-boring-cyborg-configuration |                   +check-breeze-top-dependencies-limited | check-builtin-literals |                  +check-changelog-has-no-duplicates | check-core-deprecation-classes |              +check-daysago-import-from-utils | check-decorated-operator-implements-custom-name +| check-docstring-param-types | check-example-dags-urls |                         +check-executables-have-shebangs | check-extra-packages-references |               +check-extras-order | check-for-inclusive-language | check-hooks-apply |           +check-incorrect-use-of-LoggingMixin | check-init-decorator-arguments |            +check-lazy-logging | check-links-to-example-dags-do-not-use-hardcoded-versions |  +check-merge-conflict | check-newsfragments-are-valid |                            +check-no-airflow-deprecation-in-providers | check-no-providers-in-core-examples | +check-no-relative-imports | check-only-new-session-with-provide-session |         +check-persist-credentials-disabled-in-github-workflows |                          +check-pre-commit-information-consistent | check-provide-create-sessions-imports | +check-provider-yaml-valid | check-providers-init-file-missing |                   +check-providers-subpackages-init-file-exist | check-pydevd-left-in-code |         +check-revision-heads-map | check-safe-filter-usage-in-html | check-setup-order |  +check-start-date-not-used-in-defaults | check-system-tests-present |              +check-system-tests-tocs | check-tests-unittest-testcase |                         +check-urlparse-usage-in-code | check-usage-of-re2-over-re | check-xml | codespell +| compile-www-assets | compile-www-assets-dev |                                   +create-missing-init-py-files-tests | debug-statements | detect-private-key |      +doctoc | end-of-file-fixer | fix-encoding-pragma | flynt | identity |             +insert-license | lint-chart-schema | lint-css | lint-dockerfile | lint-helm-chart +| lint-json-schema | lint-markdown | lint-openapi | mixed-line-ending | mypy-core +| mypy-dev | mypy-docs | mypy-providers | pretty-format-json | python-no-log-warn +| replace-bad-characters | rst-backticks | ruff | shellcheck |                    +trailing-whitespace | ts-compile-format-lint-www | update-black-version |         +update-breeze-cmd-output | update-breeze-readme-config-hash |                     +update-common-sql-api-stubs | update-er-diagram | update-extras |                 +update-in-the-wild-to-be-sorted | update-inlined-dockerfile-scripts |             +update-installed-providers-to-be-sorted | update-local-yml-file |                 +update-migration-references | update-providers-dependencies |                     +update-spelling-wordlist-to-be-sorted | update-supported-versions |               +update-vendored-in-k8s-json-schema | update-version | yamllint)                   +--show-diff-on-failure-sShow diff for files modified by the checks. +--initialize-environmentInitialize environment before running checks. +--max-initialization-attemptsMaximum number of attempts to initialize environment before giving up. +(INTEGER RANGE)                                                        +[default: 3; 1<=x<=10]                                                 +--github-repository-gGitHub repository used to pull, push run images.(TEXT)[default: apache/airflow] +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Selecting files to run the checks on ───────────────────────────────────────────────────────────────────────────────╮ +--file-fList of files to run the checks on.(PATH) +--all-files-aRun checks on all files. +--commit-ref-rRun checks for this commit reference only (can be any git commit-ish reference). Mutually     +exclusive with --last-commit.                                                                 +(TEXT)                                                                                        +--last-commit-cRun checks for all files in last commit. Mutually exclusive with --commit-ref. +--only-my-changes-mRun checks for commits belonging to my PR only: for all commits between merge base to `main`  +branch and HEAD of your branch.                                                               +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Common options ─────────────────────────────────────────────────────────────────────────────────────────────────────╮ +--verbose-vPrint verbose information about performed steps. +--dry-run-DIf dry-run is set, commands are only printed, not executed. +--help-hShow this message and exit. +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ diff --git a/kubernetes_tests/test_base.py b/kubernetes_tests/test_base.py index dfbf640fd6485..5b45969c3d287 100644 --- a/kubernetes_tests/test_base.py +++ b/kubernetes_tests/test_base.py @@ -17,7 +17,6 @@ from __future__ import annotations import os -import re import subprocess import tempfile import time @@ -26,6 +25,7 @@ from subprocess import check_call, check_output import pytest +import re2 import requests import requests.exceptions from requests.adapters import HTTPAdapter @@ -103,7 +103,7 @@ def _describe_resources(self, namespace: str): def _num_pods_in_namespace(namespace): air_pod = check_output(["kubectl", "get", "pods", "-n", namespace]).decode() air_pod = air_pod.split("\n") - names = [re.compile(r"\s+").split(x)[0] for x in air_pod if "airflow" in x] + names = [re2.compile(r"\s+").split(x)[0] for x in air_pod if "airflow" in x] return len(names) @staticmethod @@ -111,7 +111,7 @@ def _delete_airflow_pod(name=""): suffix = "-" + name if name else "" air_pod = check_output(["kubectl", "get", "pods"]).decode() air_pod = air_pod.split("\n") - names = [re.compile(r"\s+").split(x)[0] for x in air_pod if "airflow" + suffix in x] + names = [re2.compile(r"\s+").split(x)[0] for x in air_pod if "airflow" + suffix in x] if names: check_call(["kubectl", "delete", "pod", names[0]]) From 542d9d88c2f75417085fecad987a3804b0013b7f Mon Sep 17 00:00:00 2001 From: pierrejeambrun Date: Wed, 5 Jul 2023 01:23:51 +0200 Subject: [PATCH 2/2] Fix CI --- tests/utils/log/test_secrets_masker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/utils/log/test_secrets_masker.py b/tests/utils/log/test_secrets_masker.py index 9dad28bf4518a..e47cd5cee57e0 100644 --- a/tests/utils/log/test_secrets_masker.py +++ b/tests/utils/log/test_secrets_masker.py @@ -393,7 +393,7 @@ class TestMaskSecretAdapter: def reset_secrets_masker_and_skip_escape(self): self.secrets_masker = SecretsMasker() with patch("airflow.utils.log.secrets_masker._secrets_masker", return_value=self.secrets_masker): - with patch("airflow.utils.log.secrets_masker.re.escape", lambda x: x): + with patch("airflow.utils.log.secrets_masker.re2.escape", lambda x: x): yield def test_calling_mask_secret_adds_adaptations_for_returned_str(self):