diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..7ef4f64 --- /dev/null +++ b/.flake8 @@ -0,0 +1,22 @@ +# Generated from: +# https://github.com/plone/meta/tree/master/config/default +# See the inline comments on how to expand/tweak this configuration file +[flake8] +doctests = 1 +ignore = + # black takes care of line length + E501, + # black takes care of where to break lines + W503, + # black takes care of spaces within slicing (list[:]) + E203, + # black takes care of spaces after commas + E231, + +## +# Add extra configuration options in .meta.toml: +# [flake8] +# extra_lines = """ +# _your own configuration lines_ +# """ +## diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml new file mode 100644 index 0000000..a2d669b --- /dev/null +++ b/.github/workflows/ci.yaml @@ -0,0 +1,25 @@ +name: Python package CI + +on: + - push + - pull_request + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ['3.10', '3.11', '3.12'] + + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install tox tox-gh-actions + - name: Test with tox + run: tox \ No newline at end of file diff --git a/.gitignore b/.gitignore index 79628c7..df70903 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,7 @@ output.xml pip-selfcheck.json report.html .vscode/ +.tox .python-version reports/ # excludes diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..02a90f5 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,42 @@ +ci: + autofix_prs: false + autoupdate_schedule: monthly + +repos: +- repo: https://github.com/asottile/pyupgrade + rev: v3.15.0 + hooks: + - id: pyupgrade +- repo: https://github.com/pycqa/isort + rev: 5.12.0 + hooks: + - id: isort +- repo: 
https://github.com/psf/black + rev: 23.9.1 + hooks: + - id: black +- repo: https://github.com/PyCQA/flake8 + rev: 6.1.0 + hooks: + - id: flake8 +- repo: https://github.com/pre-commit/mirrors-mypy + rev: v1.5.1 + hooks: + - id: mypy + additional_dependencies: + - "types-requests" + - "pytest-stub" +# - repo: https://github.com/codespell-project/codespell +# rev: v2.2.5 +# hooks: +# - id: codespell +# additional_dependencies: +# - tomli +- repo: https://github.com/mgedmin/check-manifest + rev: "0.49" + hooks: + - id: check-manifest +- repo: https://github.com/regebro/pyroma + rev: "4.2" + hooks: + - id: pyroma diff --git a/CHANGES.rst b/CHANGES.rst index 0910700..ed57bf3 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -15,6 +15,8 @@ Changelog [jensens] - Add docker-compose file to start OpensSearch to example directory and move `.env` to example too. [jensens] +- Rename `ELASTIC_*` environment variables to have a consistent naming scheme, see README for details. [jensens] +- Add tox, GitHub Actions, CI and CD. [jensens] diff --git a/CONTRIBUTORS.rst b/CONTRIBUTORS.rst index aa9e86c..87cff98 100644 --- a/CONTRIBUTORS.rst +++ b/CONTRIBUTORS.rst @@ -1,5 +1,8 @@ Contributors ============ -- Jens W. Klein, jk@kleinundpartner.at -- Katja Süss, Rohberg, @ksuess +- Peter Holzer - Initiative, idea and testing. +- Jens W. Klein, jk@kleinundpartner.at - Concept & code. +- Katja Süss, Rohberg, @ksuess - Text analysis code and configuration. + +Initial implementation was made possible by `Evangelisch-reformierte Landeskirche des Kantons Zürich `_. diff --git a/README.rst b/README.rst index f7759de..b243cdb 100644 --- a/README.rst +++ b/README.rst @@ -41,27 +41,27 @@ Configuration is done via environment variables and JSON files. Environment variables are: -ELASTICSEARCH_INGEST_SERVER +INGEST_SERVER The URL of the ElasticSearch or OpenSearch server. Default: localhost:9200 -ELASTICSEARCH_INGEST_USE_SSL - Wether to use a secure connection or not. 
+INGEST_USE_SSL + Whether to use a secure connection or not. Default: 0 -OPENSEARCH - Wether to use OpenSearch or ElasticSearch. +INGEST_OPENSEARCH + Whether to use OpenSearch or ElasticSearch. Default: 1 -ELASTICSEARCH_INGEST_LOGIN +INGEST_LOGIN Username for the ElasticSearch 8+ or OpenSearch server. Default: admin -ELASTICSEARCH_INGEST_PASSWORD +INGEST_PASSWORD Password for the ElasticSearch 8+ or OpenSearch server. Default: admin @@ -139,8 +139,7 @@ A docker-compose file ``docker-compose.yml`` to start an OpenSearch server is pr Precondition: - Docker and docker-compose are installed. -- Max virtual memory map needs increase to run this: `sudo sysctl -w vm.max_map_count=262144` - (not permanent, `see StackOverflow post `_). +- Max virtual memory map needs increase to run this: `sudo sysctl -w vm.max_map_count=262144` (not permanent, `see StackOverflow post `_). Enter the directory ``examples`` and start the server with ``docker-compose up``. Now you have an OpenSearch server running on ``http://localhost:9200`` and an OpenSearch Dashboard running on ``http://localhost:5601`` (user/pass: admin/admin). @@ -245,20 +244,6 @@ We appreciate any contribution and if a release is needed to be done on pypi, pl We also offer commercial support if any training, coaching, integration or adaptions are needed. -------------- -Contributions -------------- - -Initial implementation was made possible by `Evangelisch-reformierte Landeskirche des Kantons Zürich `_. - -Idea and testing by Peter Holzer - -Concept & code by Jens W. 
Klein - -Text analysis code and configuration by Katja Süss - - - ---------------------------- Installation for development ---------------------------- @@ -266,18 +251,10 @@ Installation for development - clone source code repository, - enter repository directory - recommended: create a virtualenv ``python -mvenv env`` -- development install ``./bin/env/pip install -e .[redis,opensearch]`` +- development install ``./bin/env/pip install -e .[test,redis,opensearch]`` - load environment configuration ``source examples/.env``. ----- -Todo ----- - -- query status of a task -- simple statistics about tasks-count: pending, done, errored -- celery retry on failure, i.e. restart of ElasticSearch, Plone, ... - ------- License ------- diff --git a/constraints.txt b/constraints.txt deleted file mode 100644 index f145db9..0000000 --- a/constraints.txt +++ /dev/null @@ -1,15 +0,0 @@ -amqp==2.6.1 -billiard==3.6.4.0 -CacheControl==0.12.6 -celery==4.4.7 -certifi==2021.5.30 -chardet==4.0.0 -elasticsearch==7.13.1 -idna==2.10 -kombu==4.6.11 -msgpack==1.0.2 -pytz==2021.1 -redis==3.5.3 -requests==2.25.1 -urllib3==1.26.5 -wcwidth==0.2.5 diff --git a/examples/.env b/examples/.env index 769e757..3c5ded4 100644 --- a/examples/.env +++ b/examples/.env @@ -5,11 +5,11 @@ # Then `source .env` and start the ingest-service with: celery -A collective.elastic.ingest.celery.app worker -l debug export CELERY_BROKER=redis://localhost:6379/0 -export ELASTICSEARCH_INGEST_SERVER=localhost:9200 -export ELASTICSEARCH_INGEST_USE_SSL=1 -export OPENSEARCH=1 -export ELASTICSEARCH_INGEST_LOGIN=admin -export ELASTICSEARCH_INGEST_PASSWORD=admin +export INGEST_SERVER=localhost:9200 +export INGEST_USE_SSL=1 +export INGEST_OPENSEARCH=1 +export INGEST_LOGIN=admin +export INGEST_PASSWORD=admin export PLONE_SERVICE=http://localhost:8080 export PLONE_PATH=Plone diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 0000000..f21d8ff --- /dev/null +++ b/mypy.ini @@ -0,0 +1,2 @@ +[mypy] +mypy_path=./src diff --git 
a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..6d34897 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,90 @@ +[project] +name = "collective.elastic.ingest" +version = "2.0.0dev0" +description = "Ingestion service queue runner between Plone RestAPI and ElasticSearch or OpenSearch." +keywords = ["elasticsearch", "opensearch", "plone", "celery", "search", "indexer"] +readme = "README.rst" + +authors = [ + {name = "Jens Klein", email = "jk@kleinundpartner.at"}, + {name = "Katja Süss"}, + {name = "Peter Holzer"}, +] +requires-python = ">=3.8" +license = { text = "GPL 2.0" } +classifiers = [ + "Environment :: Web Environment", + "Framework :: Plone", + "Framework :: Plone :: Addon", + "Framework :: Plone :: 5.2", + "Framework :: Plone :: 6.0", + "Programming Language :: Python", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Operating System :: OS Independent", + "License :: OSI Approved :: GNU General Public License v2 (GPLv2)", + "Development Status :: 5 - Production/Stable", +] +dependencies = [ + "CacheControl", + "celery", + "requests", + "setuptools", +] + +[project.urls] +PyPI = "https://pypi.python.org/pypi/collective.elastic.ingest" +Changelog = "https://github.com/collective/collective.elastic.ingest/blob/main/CHANGES.rst" +Source = "https://github.com/collective/collective.elastic.ingest" +Issues = "https://github.com/collective/collective.elastic.ingest/issues" + +[project.optional-dependencies] +redis = ["celery[redis]"] +rabbitmq = ["celery[librabbitmq]"] +opensearch = ["opensearch-py"] +elasticsearch = ["elasticsearch>=8.0"] +test = [ + "pytest", + "requests-mock", + "pdbpp", +] + +[build-system] +requires = ["setuptools>=61"] +build-backend = "setuptools.build_meta" + +[tool.setuptools.packages.find] +where = ["src"] + +[tool.pytest.ini_options] +minversion = 
"6.0" +testpaths = [ + "src", +] + +[tool.isort] +profile = "plone" + +[tool.black] +include = "src" + +[tool.codespell] +ignore-words-list = "discreet," +skip = './examples/*,./venv/*' + +[tool.check-manifest] +ignore = [ + ".editorconfig", + ".pre-commit-config.yaml", + "tox.ini", + "mypy.ini", + ".flake8", + "mx.ini", + +] + +[zest.releaser] +create-wheel = true \ No newline at end of file diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index e4ff5a0..0000000 --- a/setup.cfg +++ /dev/null @@ -1,11 +0,0 @@ -[mypy] -ignore_missing_imports = True - -[bdist_wheel] -universal=1 - -[isort] -profile = plone - -[zest.releaser] -create-wheel = yes \ No newline at end of file diff --git a/setup.py b/setup.py deleted file mode 100644 index dadfb20..0000000 --- a/setup.py +++ /dev/null @@ -1,61 +0,0 @@ -# -*- coding: utf-8 -*- -from setuptools import find_packages -from setuptools import setup - - -long_description = "\n\n".join( - [ - open("README.rst").read(), - open("CHANGES.rst").read(), - open("CONTRIBUTORS.rst").read(), - ] -) - - -setup( - name="collective.elastic.ingest", - version="1.4.1.dev0", - project_urls={ - "PyPI": "https://pypi.python.org/pypi/collective.elastic.ingest", - "Source": "https://github.com/collective/collective.elastic.ingest", - "Tracker": "https://github.com/collective/collective.elastic.ingest/issues", - }, - description="Addon for ElasticSearch integration with Plone", - long_description=long_description, - # Get more from https://pypi.org/classifiers/ - classifiers=[ - "Environment :: Web Environment", - "Framework :: Plone", - "Framework :: Plone :: Addon", - "Framework :: Plone :: 5.2", - "Framework :: Plone :: 6.0", - "Programming Language :: Python", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Operating System :: OS Independent", - "License :: OSI 
Approved :: GNU General Public License v2 (GPLv2)", - ], - keywords="Python Plone", - packages=find_packages("src"), - namespace_packages=["collective", "collective.elastic"], - package_dir={"": "src"}, - include_package_data=True, - zip_safe=False, - python_requires=">=3.7", - install_requires=[ - "CacheControl", - "celery", - "requests", - "setuptools", - ], - extras_require={ - "redis": ["celery[redis]"], - "rabbitmq": ["celery[librabbitmq]"], - "opensearch": ["opensearch-py"], - "elasticsearch7": ["elasticsearch~=7.0"], - "elasticsearch8": ["elasticsearch~=8.0"], - }, -) diff --git a/src/collective/elastic/ingest/__init__.py b/src/collective/elastic/ingest/__init__.py index 3557620..c8930d4 100644 --- a/src/collective/elastic/ingest/__init__.py +++ b/src/collective/elastic/ingest/__init__.py @@ -5,8 +5,11 @@ OPENSEARCH = os.environ.get("OPENSEARCH") == "1" -version_elasticsearch = version("elasticsearch") -ELASTICSEARCH_7 = int(version_elasticsearch[0]) <= 7 - -version_opensearchpy = version("opensearch-py") -OPENSEARCH_2 = int(version_opensearchpy[0]) <= 2 +if OPENSEARCH: + version_opensearchpy = version("opensearch-py") + OPENSEARCH_2 = int(version_opensearchpy[0]) <= 2 + ELASTICSEARCH_7 = False +else: + version_elasticsearch = version("elasticsearch") + ELASTICSEARCH_7 = int(version_elasticsearch[0]) <= 7 + OPENSEARCH_2 = False diff --git a/src/collective/elastic/ingest/analysis/analysis.py b/src/collective/elastic/ingest/analysis.py similarity index 96% rename from src/collective/elastic/ingest/analysis/analysis.py rename to src/collective/elastic/ingest/analysis.py index e750015..9473222 100644 --- a/src/collective/elastic/ingest/analysis/analysis.py +++ b/src/collective/elastic/ingest/analysis.py @@ -1,5 +1,5 @@ -from ..elastic import get_ingest_client -from ..logging import logger +from .elastic import get_ingest_client +from .logging import logger from collective.elastic.ingest import ELASTICSEARCH_7 from collective.elastic.ingest import OPENSEARCH from 
collective.elastic.ingest import OPENSEARCH_2 diff --git a/src/collective/elastic/ingest/analysis/__init__.py b/src/collective/elastic/ingest/analysis/__init__.py deleted file mode 100644 index 4665f5c..0000000 --- a/src/collective/elastic/ingest/analysis/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .analysis import update_analysis diff --git a/src/collective/elastic/ingest/elastic.py b/src/collective/elastic/ingest/elastic.py index 312efda..07a84cd 100644 --- a/src/collective/elastic/ingest/elastic.py +++ b/src/collective/elastic/ingest/elastic.py @@ -1,37 +1,36 @@ -# -*- coding: utf-8 -*- from . import ELASTICSEARCH_7 from . import OPENSEARCH -from . import version_elasticsearch from .logging import logger import os + if OPENSEARCH: from opensearchpy import OpenSearch else: from elasticsearch import Elasticsearch - def get_ingest_client(elasticsearch_server_baseurl=None): """return elasticsearch client for.ingest""" raw_addr = elasticsearch_server_baseurl or os.environ.get( - "ELASTICSEARCH_INGEST_SERVER", "http://localhost:9200" + "INGEST_SERVER", "http://localhost:9200" ) - use_ssl = os.environ.get("ELASTICSEARCH_INGEST_USE_SSL", "0") + use_ssl = os.environ.get("INGEST_USE_SSL", "0") use_ssl = bool(int(use_ssl)) addresses = [x for x in raw_addr.split(",") if x.strip()] if not addresses: addresses.append("127.0.0.1:9200") + if OPENSEARCH: hosts = [] for address in addresses: host, port = address.rsplit(":", 1) hosts.append({"host": host, "port": port}) auth = ( - os.environ.get("ELASTICSEARCH_INGEST_LOGIN", "admin"), - os.environ.get("ELASTICSEARCH_INGEST_PASSWORD", "admin"), + os.environ.get("INGEST_LOGIN", "admin"), + os.environ.get("INGEST_PASSWORD", "admin"), ) client = OpenSearch( hosts=hosts, @@ -43,6 +42,8 @@ def get_ingest_client(elasticsearch_server_baseurl=None): logger.info(f"OpenSearch client info: {info}") return client elif ELASTICSEARCH_7: + from . 
import version_elasticsearch + logger.info(f"ElasticSearch version {version_elasticsearch} installed") return Elasticsearch( addresses, diff --git a/src/collective/elastic/ingest/ingest/section.py b/src/collective/elastic/ingest/ingest/section.py index cebb812..3511a08 100644 --- a/src/collective/elastic/ingest/ingest/section.py +++ b/src/collective/elastic/ingest/ingest/section.py @@ -1,5 +1,3 @@ -from ..logging import logger - import os diff --git a/src/collective/elastic/ingest/ingest/vocabularyfields.py b/src/collective/elastic/ingest/ingest/vocabularyfields.py index 7d2b671..4d596bc 100644 --- a/src/collective/elastic/ingest/ingest/vocabularyfields.py +++ b/src/collective/elastic/ingest/ingest/vocabularyfields.py @@ -1,17 +1,14 @@ -from ..logging import logger - - def stripVocabularyTermTitles(content): """If field with vocabulary: Convert field value to token or list of tokens.""" for fieldname in content.keys(): - if type(content[fieldname]) == dict: + if type(content[fieldname]) is dict: if sorted(list(content[fieldname].keys())) == ["title", "token"]: content[fieldname] = content[fieldname]["token"] - if type(content[fieldname]) == list: + if type(content[fieldname]) is list: if ( len(content[fieldname]) > 0 - and type(content[fieldname][0]) == dict + and type(content[fieldname][0]) is dict and sorted(list(content[fieldname][0].keys())) == ["title", "token"] ): content[fieldname] = [el["token"] for el in content[fieldname]] diff --git a/src/collective/elastic/ingest/logging.py b/src/collective/elastic/ingest/logging.py index 4ce52b0..bb800c2 100644 --- a/src/collective/elastic/ingest/logging.py +++ b/src/collective/elastic/ingest/logging.py @@ -1,5 +1,5 @@ try: - import collective.elastic.plone # noqa: W291 + import collective.elastic.plone # noqa: W291,F401 import logging logger = logging.getLogger("collective.elastic.ingest") diff --git a/src/collective/elastic/ingest/mapping.py b/src/collective/elastic/ingest/mapping.py index cd57767..78c7f99 100644 --- 
a/src/collective/elastic/ingest/mapping.py +++ b/src/collective/elastic/ingest/mapping.py @@ -1,3 +1,4 @@ +from . import ELASTICSEARCH_7 from .elastic import get_ingest_client from .logging import logger from copy import deepcopy @@ -6,6 +7,7 @@ import operator import os import pprint +import typing pp = pprint.PrettyPrinter(indent=4) @@ -16,7 +18,7 @@ STATE = {"initial": True} -DETECTOR_METHODS = {} +DETECTOR_METHODS: dict[str, typing.Callable] = {} _mappings_file = os.environ.get( "MAPPINGS_FILE", os.path.join(os.path.dirname(__file__), "mappings.json") @@ -65,7 +67,7 @@ def map_field(field, properties, fqfieldname, seen): seen.add(field["name"]) logger.debug(f"Map field name {field['name']} to definition {definition}") if "type" in definition: - # simple defintion + # simple definition properties[field["name"]] = definition return # complex definition diff --git a/src/collective/elastic/ingest/preprocessing.py b/src/collective/elastic/ingest/preprocessing.py index 14b3bb2..876f90a 100644 --- a/src/collective/elastic/ingest/preprocessing.py +++ b/src/collective/elastic/ingest/preprocessing.py @@ -12,7 +12,7 @@ with open(_preprocessings_file) as fp: PREPROCESSOR_CONFIGS = json.load(fp) -### MATCHERS +# MATCHERS MATCHING_FUNCTIONS = {} @@ -36,7 +36,7 @@ def match_content_exists(content, full_schema, config): MATCHING_FUNCTIONS["content_exists"] = match_content_exists -### ACTIONS +# ACTIONS ACTION_FUNCTIONS = {} @@ -44,7 +44,7 @@ def match_content_exists(content, full_schema, config): def action_additional_schema(content, full_schema, config): """add additional fields to a full_schema as fetched from Plone""" if full_schema is None: - # case: in subsequent calls theres no need to modify schema b/c of caching + # case: in subsequent calls there is no need to modify schema b/c of caching return if "additional" not in full_schema: full_schema["additional"] = {} diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..56e00b1 --- /dev/null +++ b/tox.ini @@ 
-0,0 +1,53 @@ +# Generated from: +# https://github.com/plone/meta/tree/master/config/default +# See the inline comments on how to expand/tweak this configuration file +[tox] +# We need 4.4.0 for constrain_package_deps. +min_version = 4.4.0 +envlist = + py38 + py39 + py310 + py311 + py312 + test + lint + +[gh-actions] +python = + 3.8: py38 + 3.9: py39 + 3.10: py310 + 3.11: py311 + 3.12: py312, lint + +[testenv] +use_develop = true +skip_install = false +constrain_package_deps = true + +commands = + pytest {posargs} +extras = + test + elasticsearch + opensearch-py + +[testenv:format] +description = automatically reformats code +skip_install = true +deps = + pre-commit +commands = + pre-commit run -a pyupgrade + pre-commit run -a isort + pre-commit run -a black + +[testenv:lint] +description = run linters that will help improve the code style +skip_install = true +deps = + pre-commit +commands = + pre-commit run -a +