Skip to content

Commit

Permalink
Add catalog indexer worker (#4330)
Browse files Browse the repository at this point in the history
* Set up basic docker container that can be reached from airflow

* Get reindexing task working

* Add task tracking, clean up

* Add readme, cleanup unused variables

* Clean up

* Remove reference to ingestion server

* Move indexer-worker out of catalog for easier testing & management

* Copy in relevant test files

* Move indexer worker out of catalog module, add tests

* Remove integration tests for now, clean up unit tests

* Add indexer worker to catalog codeowners

* Fix bad merge, remove log statements

* Move utility function
  • Loading branch information
stacimc authored Jun 20, 2024
1 parent 2b8907f commit 417ba9c
Show file tree
Hide file tree
Showing 30 changed files with 2,425 additions and 2 deletions.
5 changes: 3 additions & 2 deletions .github/CODEOWNERS
Validating CODEOWNERS rules …
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Specific assignments for the 'openverse-catalog' group
catalog/ @WordPress/openverse-catalog
dag-sync.sh @WordPress/openverse-catalog
catalog/ @WordPress/openverse-catalog
indexer_worker/ @WordPress/openverse-catalog
dag-sync.sh @WordPress/openverse-catalog

api/ @WordPress/openverse-api
packages/python/ @WordPress/openverse-api
Expand Down
Empty file.
64 changes: 64 additions & 0 deletions catalog/tests/dags/legacy_data_refresh/test_reporting.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import pytest

from legacy_data_refresh.reporting import (
report_record_difference,
report_status,
)


@pytest.mark.parametrize(
    "before, after, expected_in_message",
    (
        # Every source doubles.
        (
            {"src1": 1, "src2": 19},
            {"src1": 2, "src2": 38},
            ["20 → 40", "+20 (+100.000000%", "`src1`:+1", "`src2`:+19"],
        ),
        # Existing sources grow and a new source appears.
        (
            {"src1": 1, "src2": 19},
            {"src1": 3, "src2": 57, "src3": 20},
            ["20 → 80", "+60 (+300.000000%", "`src1`:+2", "`src2`:+38", "`src3`:+20"],
        ),
        # A source disappears; the unchanged source count shouldn't show up.
        (
            {"src1": 4, "src2": 21},
            {"src1": 4},
            ["25 → 4", "-21 (-84.000000%", "`src2`:-21"],
        ),
        # Counts above 999 are rendered with thousands separators.
        (
            {"src1": 4000, "src2": 20},
            {"src1": 2000, "src2": 10},
            ["4,020 → 2,010", "-2,010 (-50.000000%", "`src1`:-2,000", "`src2`:-10"],
        ),
        # Nothing before: the percentage change is reported as infinite.
        (
            {},
            {"src1": 10, "src2": 10},
            ["0 → 20", "+20 (+inf%", "`src1`:+10", "`src2`:+10"],
        ),
        # Nothing after: everything was removed.
        (
            {"src1": 10, "src2": 10},
            {},
            ["20 → 0", "-20 (-100.000000%", "`src1`:-10", "`src2`:-10"],
        ),
        # Very large counts with a fractional percentage change.
        (
            {"src1": 5_000_000_000},
            {"src1": 4_938_271_605},
            ["5,000,000,000 → 4,938,271,605", "-61,728,395 (-1.234568%"],
        ),
        # No change at all.
        ({"src1": 4}, {"src1": 4}, ["Sources not listed had no change in count"]),
        # Both sides empty.
        ({}, {}, ["Both indices missing? No breakdown to show"]),
    ),
)
def test_record_reporting(before, after, expected_in_message):
    """Check that the record-difference report contains every expected fragment.

    ``before`` and ``after`` map source names to record counts.
    ``expected_in_message`` must be a list of substrings, each of which has
    to occur in the message built by ``report_record_difference``.
    """
    message = report_record_difference(before, after, "media", "dag_id")
    # Guard against a bare string being passed: iterating a string would check
    # single characters and could pass by accident.
    assert isinstance(expected_in_message, list), (
        "Value for 'expected_in_message' should be a list, "
        "a string may give a false positive"
    )
    for fragment in expected_in_message:
        assert fragment in message


def test_report_status():
    """Check that ``report_status`` prefixes the message with the media type."""
    expected = "`image`: This is my message"
    assert report_status("image", "This is my message", "sample_dag_id") == expected
4 changes: 4 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@ include:
- "ingestion_server/compose.yml"
- "api/compose.yml"
- "frontend/compose.yml"
- "indexer_worker/compose.yml"

services:
# Database used by the API
db:
profiles:
- ingestion_server
- api
- catalog_indexer_worker
image: docker.io/postgres:13.10-alpine
ports:
- "50254:5432"
Expand All @@ -26,6 +28,7 @@ services:
- catalog
- catalog_dependencies
- ingestion_server
- catalog_indexer_worker
- api
build:
context: ./docker/upstream_db/
Expand Down Expand Up @@ -85,6 +88,7 @@ services:
es:
profiles:
- ingestion_server
- catalog_indexer_worker
- api
image: docker.elastic.co/elasticsearch/elasticsearch:8.8.2
ports:
Expand Down
5 changes: 5 additions & 0 deletions indexer_worker/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Exclude everything and only include certain files
*
!indexer_worker
!gunicorn*
!Pipfile*
82 changes: 82 additions & 0 deletions indexer_worker/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# syntax=docker/dockerfile:1

# Automatically build image using Python version specified in the `Pipfile`.
ARG CATALOG_PY_VERSION

##################
# Python builder #
##################

# Build stage: creates the virtualenv at `/venv` and installs the locked
# Python dependencies into it.
FROM docker.io/python:${CATALOG_PY_VERSION} AS builder

# Container optimizations
ENV PYTHONUNBUFFERED=1
ENV PIP_NO_CACHE_DIR=1
ENV PIP_NO_COLOR=1

# Activate the virtualenv
ENV PATH="/venv/bin:$PATH"

# - Install system packages needed for building Python dependencies
# - Create a virtualenv inside `/venv`
# - Install Pipenv to install Python dependencies
RUN apt-get update \
    && apt-get install -y python3-dev \
    && rm -rf /var/lib/apt/lists/* \
    && python -m venv /venv \
    && pip install --upgrade pipenv

# Copy the Pipenv files into the container
COPY Pipfile Pipfile.lock ./

# Install Python dependencies system-wide (uses the active virtualenv).
# `--deploy` makes the install fail if `Pipfile.lock` is out of date.
# NOTE(review): `--dev` also installs the `[dev-packages]` section into
# `/venv` — confirm that dev/test tooling is intended in this image.
RUN pipenv install --system --deploy --dev

####################
# Indexer worker #
####################

# Runtime stage: slim image that serves the indexer worker app with Gunicorn.
FROM docker.io/python:${CATALOG_PY_VERSION}-slim AS ing

LABEL org.opencontainers.image.source="https://github.com/WordPress/openverse"

# Container optimizations
ENV PYTHONUNBUFFERED=1
ENV PIP_NO_CACHE_DIR=1
ENV PIP_NO_COLOR=1

# Activate the virtualenv
ENV PATH="/venv/bin:$PATH"

# NOTE(review): `$PYTHONPATH` is not set earlier in this stage; unless the
# base image defines it, this expands to ":/indexer_worker/" — confirm the
# leading empty entry is acceptable.
ENV PYTHONPATH="$PYTHONPATH:/indexer_worker/"
# TLDEXTRACT fails to cache in /home/supervisord, set its cache to /tmp instead
ENV TLDEXTRACT_CACHE="/tmp/python-tldextract"

WORKDIR /indexer_worker

# Copy virtualenv from the builder image
COPY --from=builder /venv /venv

# - Install system packages needed for running Python dependencies
#   - libpq-dev: required by `psycopg2`
# - Create directory for holding worker state
RUN apt-get update \
    && apt-get install -y curl libpq-dev \
    && rm -rf /var/lib/apt/lists/* \
    && mkdir /worker_state

# Create a non-root user that owns the worker state directory, then drop
# root privileges for everything that follows
RUN useradd ingestionu \
    && chown ingestionu /worker_state
USER ingestionu

# Copy code into the final image
COPY --chown=ingestionu . /indexer_worker/

# Expose Gunicorn server to indexer worker Falcon app
EXPOSE 8003

# Release identifier passed in at build time and exposed to Sentry
ARG SEMANTIC_VERSION
ENV SENTRY_RELEASE=$SEMANTIC_VERSION

CMD ["gunicorn", "--bind", "0.0.0.0:8003", "api:api"]
31 changes: 31 additions & 0 deletions indexer_worker/Pipfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Package index from which all dependencies are resolved
[[source]]
name = "pypi"
url = "https://pypi.org/simple"
verify_ssl = true

# Development and test-only dependencies (installed when `--dev` is passed)
[dev-packages]
ipython = "~=8.16"
pytest = "~=7.4"
pytest-order = "~=1.1"
pytest-sugar = "~=0.9"
remote-pdb = "~=2.1"
pook = "~=1.0"

# Runtime dependencies; `~=` is a compatible-release constraint
[packages]
aws-requests-auth = "~=0.4"
boto3 = "~=1.28"
bottle = "~=0.12"
# Pinned to an exact version, unlike the compatible-release pins around it
elasticsearch = "==8.13.0"
elasticsearch-dsl = "~=8.9"
falcon = "~=3.1"
filelock = "~=3.13"
gunicorn = "~=22.0"
jsonschema = "~=4.19"
psycopg2 = "~=2.9"
python-decouple = "~=3.8"
PyYAML = "~=6.0"
tldextract = "~=5.0"
# No version constraint here: any release satisfies "*"
sentry-sdk = {extras = ["falcon"], version = "*"}

# Python version this environment is expected to run on
[requires]
python_version = "3.11"
Loading

0 comments on commit 417ba9c

Please sign in to comment.