diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4f41e866..df1cfb3e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -3,7 +3,6 @@ name: CI on: push: pull_request: - workflow_dispatch: jobs: checks: @@ -11,6 +10,9 @@ jobs: steps: - name: Checkout working copy uses: actions/checkout@v4 + with: + submodules: true + fetch-depth: 0 - name: ruff check uses: chartboost/ruff-action@v1 - name: ruff format @@ -29,7 +31,7 @@ jobs: if: ${{ always() && steps.setup_python.conclusion == 'success' }} run: | python -mpip install --upgrade pip - python -mpip install mypy types-PyYaml + python -mpip install mypy types-PyYaml ./ua-parser-builtins - name: mypy if: ${{ always() && steps.install_mypy.conclusion == 'success' }} run: mypy @@ -101,6 +103,7 @@ jobs: uses: actions/checkout@v4 with: submodules: true + fetch-depth: 0 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: @@ -115,6 +118,7 @@ jobs: sudo apt install libyaml-dev fi - run: python -mpip install pytest pyyaml + - run: python -mpip install ./ua-parser-builtins # install rs accelerator if available, ignore if not - run: python -mpip install ua-parser-rs || true # re2 is basically impossible to install from source so don't diff --git a/pyproject.toml b/pyproject.toml index 65271a4c..c0d4192c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ description = "Python port of Browserscope's user agent parser" version = "1.0.0a1" readme = "README.rst" requires-python = ">=3.9" -dependencies = [] +dependencies = ["ua-parser-builtins"] license = {text = "Apache 2.0"} urls = {repository = "https://github.com/ua-parser/uap-python"} @@ -57,8 +57,7 @@ where = ["src"] [tool.ruff] exclude = [ - "src/ua_parser/_lazy.py", - "src/ua_parser/_matchers.py", + "src/ua_parser/generate_builtins.py", ] [tool.ruff.lint] diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 9b07aee0..00000000 --- a/setup.cfg +++ /dev/null @@ -1,8 +0,0 @@ 
-[options] -packages = find: -package_dir = - =src -setup_requires = pyyaml - -[options.packages.find] -where = src diff --git a/setup.py b/setup.py deleted file mode 100644 index f423348e..00000000 --- a/setup.py +++ /dev/null @@ -1,221 +0,0 @@ -#!/usr/bin/env python -# flake8: noqa -import io -from contextlib import suppress, contextmanager -from os import fspath -from pathlib import Path -from typing import Optional, List, Dict - -from setuptools import setup, Command, find_namespace_packages -from setuptools.command.build import build, SubCommand -from setuptools.command.editable_wheel import editable_wheel - -import yaml - - -build.sub_commands.insert(0, ("compile-regexes", None)) - - -class CompileRegexes(Command, SubCommand): - def initialize_options(self) -> None: - self.pkg_name: Optional[str] = None - - def finalize_options(self) -> None: - self.pkg_name = self.distribution.get_name().replace("-", "_") - - def get_source_files(self) -> List[str]: - return ["uap-core/regexes.yaml"] - - def get_outputs(self) -> List[str]: - return [f"{self.pkg_name}/_regexes.py"] - - def get_output_mapping(self) -> Dict[str, str]: - return dict(zip(self.get_source_files(), self.get_outputs())) - - def run(self) -> None: - # FIXME: check git / submodules? - """ - work_path = self.work_path - if not os.path.exists(os.path.join(work_path, ".git")): - return - - log.info("initializing git submodules") - check_output(["git", "submodule", "init"], cwd=work_path) - check_output(["git", "submodule", "update"], cwd=work_path) - """ - if not self.pkg_name: - return # or error? 
- - yaml_src = Path("uap-core", "regexes.yaml") - if not yaml_src.is_file(): - raise RuntimeError( - f"Unable to find regexes.yaml, should be at {yaml_src!r}" - ) - - with yaml_src.open("rb") as f: - regexes = yaml.safe_load(f) - - if self.editable_mode: - dist_dir = Path("src") - else: - dist_dir = Path(self.get_finalized_command("bdist_wheel").bdist_dir) - - outdir = dist_dir / self.pkg_name - outdir.mkdir(parents=True, exist_ok=True) - - dest = outdir / "_matchers.py" - dest_lazy = outdir / "_lazy.py" - dest_legacy = outdir / "_regexes.py" - - with ( - dest.open("wb") as eager, - dest_lazy.open("wb") as lazy, - dest_legacy.open("wb") as legacy, - ): - eager = EagerWriter(eager) - lazy = LazyWriter(lazy) - legacy = LegacyWriter(legacy) - - for section in ["user_agent_parsers", "os_parsers", "device_parsers"]: - with ( - eager.section(section), - lazy.section(section), - legacy.section(section), - ): - extract = EXTRACTORS[section] - for p in regexes[section]: - el = trim(extract(p)) - eager.item(el) - lazy.item(el) - legacy.item(el) - eager.end() - lazy.end() - legacy.end() - - -def trim(l): - while len(l) > 1 and l[-1] is None: - l.pop() - return l - - -EXTRACTORS = { - "user_agent_parsers": lambda p: [ - p["regex"], - p.get("family_replacement"), - p.get("v1_replacement"), - p.get("v2_replacement"), - ], - "os_parsers": lambda p: [ - p["regex"], - p.get("os_replacement"), - p.get("os_v1_replacement"), - p.get("os_v2_replacement"), - p.get("os_v3_replacement"), - p.get("os_v4_replacement"), - ], - "device_parsers": lambda p: [ - p["regex"], - p.get("regex_flag"), - p.get("device_replacement"), - p.get("brand_replacement"), - p.get("model_replacement"), - ], -} - - -class Writer: - section_end = b"" - - def __init__(self, fp): - self.fp = fp - self.fp.write( - b"""\ -######################################################## -# NOTICE: this file is autogenerated from regexes.yaml # -######################################################## -""" - ) - 
self.fp.write(self.prefix) - self._section = None - - @contextmanager - def section(self, id): - self._section = id - self.fp.write(self.sections[id]) - yield - self.fp.write(self.section_end) - - def item(self, elements): - # DeviceMatcher(re, flag, repl1), - self.fp.write(self.items[self._section]) - self.fp.write(", ".join(map(repr, elements)).encode()) - self.fp.write(b"),\n") - - def end(self): - self.fp.write(self.suffix) - - -class LegacyWriter(Writer): - prefix = b"""\ -__all__ = [ - "USER_AGENT_PARSERS", - "DEVICE_PARSERS", - "OS_PARSERS", -] - -from .user_agent_parser import UserAgentParser, DeviceParser, OSParser - -""" - sections = { - "user_agent_parsers": b"USER_AGENT_PARSERS = [\n", - "os_parsers": b"\n\nOS_PARSERS = [\n", - "device_parsers": b"\n\nDEVICE_PARSERS = [\n", - } - section_end = b"]" - items = { - "user_agent_parsers": b" UserAgentParser(", - "os_parsers": b" OSParser(", - "device_parsers": b" DeviceParser(", - } - suffix = b"\n" - - -class EagerWriter(Writer): - prefix = b"""\ -__all__ = ["MATCHERS"] - -from typing import Tuple, List -from .matchers import UserAgentMatcher, OSMatcher, DeviceMatcher - -MATCHERS: Tuple[List[UserAgentMatcher], List[OSMatcher], List[DeviceMatcher]] = ([ -""" - sections = { - "user_agent_parsers": b"", - "os_parsers": b"], [\n", - "device_parsers": b"], [\n", - } - items = { - "user_agent_parsers": b" UserAgentMatcher(", - "os_parsers": b" OSMatcher(", - "device_parsers": b" DeviceMatcher(", - } - suffix = b"])\n" - - -class LazyWriter(EagerWriter): - prefix = b"""\ -__all__ = ["MATCHERS"] - -from typing import Tuple, List -from .lazy import UserAgentMatcher, OSMatcher, DeviceMatcher - -MATCHERS: Tuple[List[UserAgentMatcher], List[OSMatcher], List[DeviceMatcher]] = ([ -""" - - -setup( - cmdclass={ - "compile-regexes": CompileRegexes, - } -) diff --git a/src/ua_parser/_lazy.pyi b/src/ua_parser/_lazy.pyi deleted file mode 100644 index 741db1af..00000000 --- a/src/ua_parser/_lazy.pyi +++ /dev/null @@ -1,11 +0,0 
@@ -__all__ = ["MATCHERS"] - -from typing import List, Tuple - -from .lazy import DeviceMatcher, OSMatcher, UserAgentMatcher - -MATCHERS: Tuple[ - List[UserAgentMatcher], - List[OSMatcher], - List[DeviceMatcher], -] diff --git a/src/ua_parser/_matchers.pyi b/src/ua_parser/_matchers.pyi deleted file mode 100644 index 2269fb43..00000000 --- a/src/ua_parser/_matchers.pyi +++ /dev/null @@ -1,11 +0,0 @@ -__all__ = ["MATCHERS"] - -from typing import List, Tuple - -from .matchers import DeviceMatcher, OSMatcher, UserAgentMatcher - -MATCHERS: Tuple[ - List[UserAgentMatcher], - List[OSMatcher], - List[DeviceMatcher], -] diff --git a/src/ua_parser/_regexes.pyi b/src/ua_parser/_regexes.pyi deleted file mode 100644 index 10bc2ef4..00000000 --- a/src/ua_parser/_regexes.pyi +++ /dev/null @@ -1,7 +0,0 @@ -from typing import List - -from .user_agent_parser import DeviceParser, OSParser, UserAgentParser - -USER_AGENT_PARSERS: List[UserAgentParser] -OS_PARSERS: List[OSParser] -DEVICE_PARSERS: List[DeviceParser] diff --git a/src/ua_parser/loaders.py b/src/ua_parser/loaders.py index 18fc3d25..55774eaf 100644 --- a/src/ua_parser/loaders.py +++ b/src/ua_parser/loaders.py @@ -52,7 +52,7 @@ def load_builtins() -> Matchers: further imports simply reference the existing datas. """ - from ._matchers import MATCHERS + from ua_parser_builtins.matchers import MATCHERS # typing and mypy don't have safe upcast (#5756) and mypy is # unhappy about returning concrete matchers for a mixed type @@ -66,7 +66,7 @@ def load_lazy_builtins() -> Matchers: further imports simply reference the existing datas. 
""" - from ._lazy import MATCHERS + from ua_parser_builtins.lazy import MATCHERS return cast(Matchers, MATCHERS) diff --git a/src/ua_parser/user_agent_parser.py b/src/ua_parser/user_agent_parser.py index 5cb1c744..e6e4bb3e 100644 --- a/src/ua_parser/user_agent_parser.py +++ b/src/ua_parser/user_agent_parser.py @@ -521,4 +521,8 @@ def GetFilters( del SafeLoader else: # Just load our pre-compiled versions - from ._regexes import DEVICE_PARSERS, OS_PARSERS, USER_AGENT_PARSERS + from ua_parser_builtins.regexes import ( + DEVICE_PARSERS, + OS_PARSERS, + USER_AGENT_PARSERS, + ) diff --git a/tests/test_core.py b/tests/test_core.py index 310ddec5..1a87702f 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -3,25 +3,22 @@ import dataclasses import logging import pathlib -import platform from operator import attrgetter +from typing import cast import pytest # type: ignore -if platform.python_implementation() == "PyPy": - from yaml import SafeLoader, load -else: - try: - from yaml import ( # type: ignore - CSafeLoader as SafeLoader, - load, - ) - except ImportError: - logging.getLogger(__name__).warning( - "PyYaml C extension not available to run tests, this will result " - "in dramatic tests slowdown." - ) - from yaml import SafeLoader, load +try: + from yaml import ( + CSafeLoader as SafeLoader, + load, + ) +except ImportError: + logging.getLogger(__name__).warning( + "PyYaml C extension not available to run tests, this will result " + "in tests slowdown." 
+ ) + from yaml import SafeLoader, load # type: ignore from ua_parser import ( BasicResolver, @@ -32,15 +29,22 @@ UserAgent, load_builtins, load_lazy_builtins, + loaders, ) from ua_parser.matchers import UserAgentMatcher CORE_DIR = (pathlib.Path(__name__).parent.parent / "uap-core").resolve() +data = cast(loaders.FileLoader, loaders.load_yaml)(CORE_DIR / "regexes.yaml") +data_lazy = cast(loaders.FileLoader, loaders.load_yaml)( + CORE_DIR / "regexes.yaml", loader=loaders.load_lazy +) PARSERS = [ pytest.param(Parser(BasicResolver(load_builtins())), id="basic"), pytest.param(Parser(BasicResolver(load_lazy_builtins())), id="lazy"), + pytest.param(Parser(BasicResolver(data)), id="basic-yaml"), + pytest.param(Parser(BasicResolver(data_lazy)), id="lazy-yaml"), ] try: from ua_parser import re2 @@ -51,7 +55,7 @@ ) ) else: - PARSERS.append(pytest.param(Parser(re2.Resolver(load_builtins())), id="re2")) + PARSERS.append(pytest.param(Parser(re2.Resolver(data)), id="re2")) try: from ua_parser import regex @@ -64,7 +68,7 @@ ) ) else: - PARSERS.append(pytest.param(Parser(regex.Resolver(load_builtins())), id="regex")) + PARSERS.append(pytest.param(Parser(regex.Resolver(data)), id="regex")) UA_FIELDS = {f.name for f in dataclasses.fields(UserAgent)} diff --git a/tox.ini b/tox.ini index de36509a..0f2edd4c 100644 --- a/tox.ini +++ b/tox.ini @@ -23,6 +23,7 @@ deps = pyyaml google-re2 ua-parser-rs + ./ua-parser-builtins commands = pytest -Werror --doctest-glob="*.rst" {posargs} @@ -31,6 +32,7 @@ deps = pytest pyyaml ua-parser-rs + ./ua-parser-builtins [testenv:flake8] package = skip @@ -47,4 +49,5 @@ package = skip deps = mypy types-PyYaml + ./ua-parser-builtins commands = mypy {posargs:} diff --git a/ua-parser-builtins/README.md b/ua-parser-builtins/README.md new file mode 100644 index 00000000..71477b72 --- /dev/null +++ b/ua-parser-builtins/README.md @@ -0,0 +1,7 @@ +# Precompiled ruleset for [ua-parser](https://pypi.org/project/ua-parser/) + +This project does not do anything on its 
own, nor does it have any
+actual API. It only contains the dataset of
+[uap-core](https://github.com/ua-parser/uap-core) pre-compiled for
+use by [ua-parser](https://pypi.org/project/ua-parser/) to decrease
+load times.
diff --git a/ua-parser-builtins/hatch_build.py b/ua-parser-builtins/hatch_build.py
new file mode 100644
index 00000000..e92e9730
--- /dev/null
+++ b/ua-parser-builtins/hatch_build.py
@@ -0,0 +1,232 @@
+from __future__ import annotations
+
+import io
+import os
+import os.path
+import tempfile
+from contextlib import contextmanager
+from typing import Any, Callable, ClassVar, Iterator, cast
+
+import yaml
+from hatchling.builders.hooks.plugin.interface import BuildHookInterface
+from hatchling.metadata.plugin.interface import MetadataHookInterface
+from versioningit import get_version
+
+
+class MetadataHook(MetadataHookInterface):
+    """Set the project version using versioningit on the ``uap-core`` dir."""
+
+    def update(self, metadata: dict[str, Any]) -> None:
+        """Store the computed version under ``metadata["version"]``."""
+        v = get_version(
+            os.path.join(self.root, "uap-core"),
+            config={
+                "format": {
+                    "distance": "{next_version}.dev{distance}",
+                }
+            },
+        )
+        metadata["version"] = v
+
+
+class CompilerHook(BuildHookInterface):
+    """Compile ``uap-core/regexes.yaml`` into the builtin rule modules."""
+
+    def initialize(
+        self,
+        version: str,
+        build_data: dict[str, Any],
+    ) -> None:
+        """Generate the rule modules into temp files and force-include them."""
+        with open(os.path.join(self.root, "uap-core/regexes.yaml"), "rb") as f:
+            data = yaml.safe_load(f)
+
+        with (
+            tempfile.NamedTemporaryFile(delete=False) as matchers,
+            tempfile.NamedTemporaryFile(delete=False) as lazy,
+            tempfile.NamedTemporaryFile(delete=False) as regexes,
+        ):
+            matchers_w = EagerWriter(cast(io.RawIOBase, matchers))
+            lazy_w = LazyWriter(cast(io.RawIOBase, lazy))
+            legacy_w = LegacyWriter(cast(io.RawIOBase, regexes))
+
+            # Walk the sections in the fixed order of EXTRACTORS, not in
+            # YAML key order: the generated MATCHERS tuple is positional
+            # (user agent, OS, device), and an unexpected top-level key in
+            # regexes.yaml must not abort the build with a KeyError.
+            for section, extract in EXTRACTORS.items():
+                with (
+                    matchers_w.section(section),
+                    lazy_w.section(section),
+                    legacy_w.section(section),
+                ):
+                    for s in data[section]:
+                        el = trim(extract(s))
+                        matchers_w.item(el)
+                        lazy_w.item(el)
+                        legacy_w.item(el)
+
+            matchers_w.end()
+            lazy_w.end()
+            legacy_w.end()
+
+        build_data["force_include"][matchers.name] = "ua_parser_builtins/matchers.py"
+        build_data["force_include"][lazy.name] = "ua_parser_builtins/lazy.py"
+        build_data["force_include"][regexes.name] = "ua_parser_builtins/regexes.py"
+
+    def finalize(
+        self,
+        version: str,
+        build_data: dict[str, Any],
+        artifact_path: str,
+    ) -> None:
+        """Remove the force-included files located under the temp directory."""
+        tempdir = tempfile.gettempdir()
+        for k in build_data["force_include"]:
+            if k.startswith(tempdir):
+                os.remove(k)
+
+
+def trim(items: list[str | None]) -> list[str | None]:
+    """Removes trailing `None` from the extraction"""
+    while len(items) > 1 and items[-1] is None:
+        items.pop()
+    return items
+
+
+# Per-section extraction of the positional constructor arguments; the key
+# order is also the order in which sections are written to the modules.
+# NOTE(review): the v3/v4 replacements are also emitted into the legacy
+# ``UserAgentParser(...)`` calls written by LegacyWriter -- confirm that
+# constructor accepts the two extra positional arguments.
+EXTRACTORS: dict[str, Callable[[dict[str, str]], list[str | None]]] = {
+    "user_agent_parsers": lambda p: [
+        p["regex"],
+        p.get("family_replacement"),
+        p.get("v1_replacement"),
+        p.get("v2_replacement"),
+        p.get("v3_replacement"),
+        p.get("v4_replacement"),
+    ],
+    "os_parsers": lambda p: [
+        p["regex"],
+        p.get("os_replacement"),
+        p.get("os_v1_replacement"),
+        p.get("os_v2_replacement"),
+        p.get("os_v3_replacement"),
+        p.get("os_v4_replacement"),
+    ],
+    "device_parsers": lambda p: [
+        p["regex"],
+        p.get("regex_flag"),
+        p.get("device_replacement"),
+        p.get("brand_replacement"),
+        p.get("model_replacement"),
+    ],
+}
+
+
+class Writer:
+    """Base emitter for one generated module; subclasses supply the text."""
+
+    items: ClassVar[dict[str, bytes]]
+    sections: ClassVar[dict[str, bytes]]
+    prefix: bytes
+    suffix = b""
+    section_end = b""
+
+    def __init__(self, fp: io.RawIOBase) -> None:
+        self.fp = fp
+        self.fp.write(
+            b"""\
+########################################################
+# NOTICE: this file is autogenerated from regexes.yaml #
+########################################################
+"""
+        )
+        self.fp.write(self.prefix)
+        self._section: str | None = None
+
+    @contextmanager
+    def section(self, id: str) -> Iterator[None]:
+        """Write the opening of section `id` on entry and its end on exit."""
+        self._section = id
+        self.fp.write(self.sections[id])
+        yield
+        self.fp.write(self.section_end)
+
+    def item(self, elements: list[str | None]) -> None:
+        """Write one constructor call with the `repr` of each element."""
+        # DeviceMatcher(re, flag, repl1),
+        # assume we're in a section
+        self.fp.write(self.items[cast(str, self._section)])
+        self.fp.write(", ".join(map(repr, elements)).encode())
+        self.fp.write(b"),\n")
+
+    def end(self) -> None:
+        """Write the module suffix."""
+        self.fp.write(self.suffix)
+
+
+class LegacyWriter(Writer):
+    """Emit the three ``*_PARSERS`` lists of legacy parser objects."""
+
+    prefix = b"""\
+__all__ = [
+    "USER_AGENT_PARSERS",
+    "DEVICE_PARSERS",
+    "OS_PARSERS",
+]
+
+from ua_parser.user_agent_parser import UserAgentParser, DeviceParser, OSParser
+
+"""
+    sections: ClassVar[dict[str, bytes]] = {
+        "user_agent_parsers": b"USER_AGENT_PARSERS = [\n",
+        "os_parsers": b"\n\nOS_PARSERS = [\n",
+        "device_parsers": b"\n\nDEVICE_PARSERS = [\n",
+    }
+    section_end = b"]"
+    items: ClassVar[dict[str, bytes]] = {
+        "user_agent_parsers": b"    UserAgentParser(",
+        "os_parsers": b"    OSParser(",
+        "device_parsers": b"    DeviceParser(",
+    }
+    suffix = b"\n"
+
+
+class EagerWriter(Writer):
+    """Emit the ``MATCHERS`` tuple using the ``ua_parser.matchers`` classes."""
+
+    prefix = b"""\
+__all__ = ["MATCHERS"]
+
+from typing import Tuple, List
+from ua_parser.matchers import UserAgentMatcher, OSMatcher, DeviceMatcher
+
+MATCHERS: Tuple[List[UserAgentMatcher], List[OSMatcher], List[DeviceMatcher]] = ([
+"""
+    sections: ClassVar[dict[str, bytes]] = {
+        "user_agent_parsers": b"",
+        "os_parsers": b"], [\n",
+        "device_parsers": b"], [\n",
+    }
+    items: ClassVar[dict[str, bytes]] = {
+        "user_agent_parsers": b"    UserAgentMatcher(",
+        "os_parsers": b"    OSMatcher(",
+        "device_parsers": b"    DeviceMatcher(",
+    }
+    suffix = b"])\n"
+
+
+class LazyWriter(EagerWriter):
+    """Same layout as EagerWriter, using the ``ua_parser.lazy`` classes."""
+
+    prefix = b"""\
+__all__ = ["MATCHERS"]
+
+from typing import Tuple, List
+from ua_parser.lazy import UserAgentMatcher, OSMatcher, DeviceMatcher
+
+MATCHERS: Tuple[List[UserAgentMatcher], List[OSMatcher], List[DeviceMatcher]] = ([
+"""
diff --git a/ua-parser-builtins/pyproject.toml b/ua-parser-builtins/pyproject.toml
new file mode 100644
index 00000000..db0da38b
--- /dev/null
+++ b/ua-parser-builtins/pyproject.toml
@@ -0,0 +1,44 @@
+[build-system]
+requires = ["hatchling", "versioningit", "pyyaml"]
+build-backend = "hatchling.build"
+ +[project] +name = "ua-parser-builtins" +description = "Precompiled rules for User Agent Parser" +readme = "README.md" +dependencies = ["ua-parser"] +requires-python = ">=3.9" +license = {text = "Apache 2.0"} +urls = {repository = "https://github.com/ua-parser/uap-python"} +dynamic = ["version"] +maintainers = [ + { name = "masklinn", email = "uap@masklinn.net" } +] + +classifiers = [ + "Development Status :: 4 - Beta", + "Environment :: Web Environment", + "Intended Audience :: Developers", + "Operating System :: OS Independent", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python", + "Topic :: Internet :: WWW/HTTP", + "Topic :: Software Development :: Libraries :: Python Modules", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", + # "Programming Language :: Python :: Implementation :: GraalPy", +] + +[tool.hatch.build.hooks.custom] + +[tool.hatch.metadata.hooks.custom] + +[tool.hatch.build.targets.sdist] +artifacts = [ + "uap-core/regexes.yaml", +] diff --git a/ua-parser-builtins/ua_parser_builtins/__init__.py b/ua-parser-builtins/ua_parser_builtins/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/ua-parser-builtins/ua_parser_builtins/py.typed b/ua-parser-builtins/ua_parser_builtins/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/ua-parser-builtins/uap-core b/ua-parser-builtins/uap-core new file mode 120000 index 00000000..fbefe368 --- /dev/null +++ b/ua-parser-builtins/uap-core @@ -0,0 +1 @@ +../uap-core \ No newline at end of file