This repository has been archived by the owner on Feb 22, 2023. It is now read-only.

Add pre-commit, format all files, and add justfile #224

Merged
merged 5 commits on Sep 22, 2021

Changes from all commits

13 changes: 13 additions & 0 deletions .flake8
@@ -0,0 +1,13 @@
[flake8]
# match black formatter's behavior
# https://www.flake8rules.com/rules/E203.html
# https://www.flake8rules.com/rules/W503.html
ignore = E203, W503
per-file-ignores =
    # Ignore maximum line length rule for test files
    *test*:E501
    # https://www.flake8rules.com/rules/F401.html; init files act as re-exporters
    *__init__*:F401
    # https://www.flake8rules.com/rules/E402.html; patches are applied before all imports are finished
    *wsgi.py:E402
max-line-length = 88
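
These rules are normally applied through the flake8 pre-commit hook configured later in this PR, but the file can also be exercised directly. A minimal sketch from the repository root, assuming flake8 is installed and using `openverse-api/catalog` (the path the old CI job linted) as the target:

```
flake8 --config=.flake8 openverse-api/catalog
```
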
6 changes: 3 additions & 3 deletions .github/dependabot.yml
@@ -2,7 +2,7 @@
# Dependabot Configuration File #
#################################

# current Github-native version of Dependabot
# current Github-native version of Dependabot
version: 2

updates:
@@ -17,7 +17,7 @@ updates:
- "💻 aspect: code"
- "🧰 goal: internal improvement"
- "dependencies"

# Enable version updates for Python libs in Openverse API
- package-ecosystem: 'pip'
# Look for a `Pipfile` in the `/openverse-api` directory
@@ -30,7 +30,7 @@ updates:
- "🧰 goal: internal improvement"
- "dependencies"
- "python"

# Enable version updates for Python libs in ingestion server
- package-ecosystem: 'pip'
# Look for a `Pipfile` in the `/ingestion_server` directory
23 changes: 13 additions & 10 deletions .github/workflows/integration-tests.yml
@@ -9,18 +9,21 @@ on:
workflow_dispatch:

jobs:
Style:
Linting:
runs-on: ubuntu-latest
steps:
- uses: actions/setup-python@v2
- name: Install pycodestyle
run: pip install pycodestyle
- name: Checkout
uses: actions/checkout@v2
- name: Check API style
run: pycodestyle openverse-api/catalog --exclude='openverse-api/catalog/api/migrations,openverse-api/catalog/example_responses.py' --max-line-length=80 --ignore=E402,E702
- name: Check ingestion-server style
run: pycodestyle ingestion_server/ingestion_server --max-line-length=80 --ignore=E402
- uses: actions/checkout@v2
- name: Set up Python 3.9
uses: actions/setup-python@v2
with:
python-version: 3.9
- name: Lint
working-directory: ./openverse-api
run: |
pip install --upgrade pipenv
pipenv install --dev
pipenv run pre-commit run --all-files

Tests:
timeout-minutes: 15
runs-on: ubuntu-latest
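
The rewritten `Linting` job replaces the ad-hoc pycodestyle invocations with the project's pre-commit hooks. The same check can be reproduced locally with essentially the commands from the job's `run` block; a sketch, assuming Python 3.9 and pip are already available:

```
cd openverse-api
pip install --upgrade pipenv
pipenv install --dev
pipenv run pre-commit run --all-files
```
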
2 changes: 1 addition & 1 deletion .github/workflows/pr_label_check.yml
@@ -27,4 +27,4 @@ jobs:
- name: Check goal label
uses: sugarshin/[email protected]
with:
required_oneof: '🌟 goal: addition,🛠 goal: fix,✨ goal: improvement,🧰 goal: internal improvement'
required_oneof: '🌟 goal: addition,🛠 goal: fix,✨ goal: improvement,🧰 goal: internal improvement'
51 changes: 51 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,51 @@
exclude: Pipfile\.lock|migrations|\.idea

repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.0.1
    hooks:
      - id: trailing-whitespace
      - id: check-executables-have-shebangs
      - id: check-json
      - id: check-case-conflict
      - id: check-toml
      - id: check-merge-conflict
      - id: check-xml
      - id: check-yaml
      - id: end-of-file-fixer
      - id: check-symlinks
      - id: mixed-line-ending
      - id: fix-encoding-pragma
        args:
          - --remove
      - id: pretty-format-json
        args:
          - --autofix
      - id: requirements-txt-fixer

  - repo: https://github.com/PyCQA/isort
    rev: 5.9.1
    hooks:
      - id: isort
        files: \.py$
        exclude: ^build/.*$|^.tox/.*$|^venv/.*$
        args:
          - --lines-after-imports=2
          - --multi-line=3
          - --trailing-comma
          - --force-grid-wrap=0
          - --use-parentheses
          - --ensure-newline-before-comments
          - --line-length=88

  - repo: https://github.com/PyCQA/flake8
    rev: 3.9.2
    hooks:
      - id: flake8

  - repo: https://github.com/ambv/black
    rev: 21.6b0
    hooks:
      - id: black
        args:
          - --safe
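
To have these hooks run on every commit rather than only in CI, a contributor would register them once. A minimal sketch, assuming pre-commit is installed in the `openverse-api` Pipenv environment as in the workflow above:

```
cd openverse-api
pipenv run pre-commit install                # register the git hook
pipenv run pre-commit run black --all-files  # optionally run a single hook by id
```
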
46 changes: 24 additions & 22 deletions README.md
@@ -14,7 +14,7 @@ In the [API documentation](https://api.openverse.engineering), you can find more

### Prerequisites

You need to install [Docker](https://docs.docker.com/install/) (with [Docker Compose](https://docs.docker.com/compose/install/)), [Git](https://git-scm.com/downloads), and [PostgreSQL client tools](https://www.postgresql.org/download/). On Debian, the package is called `postgresql-client-common`.
You need to install [Docker](https://docs.docker.com/install/) (with [Docker Compose](https://docs.docker.com/compose/install/)), [Git](https://git-scm.com/downloads), and [PostgreSQL client tools](https://www.postgresql.org/download/). On Debian, the package is called `postgresql-client-common`. You'll also want to install the [just](https://github.com/casey/just) command runner.

### Running locally

Expand All @@ -27,10 +27,10 @@ git clone https://github.com/WordPress/openverse-api.git
```

4. Change directories with `cd openverse-api`
5. Start Openverse API locally by running the docker containers
5. Start the Openverse API locally by running the Docker containers. You can use the usual `docker-compose` commands or the simplified `just` commands. You will need the [just](https://github.com/casey/just#installation) command runner installed to follow the next steps.

```
docker-compose up
just up
```

6. Wait until your CMD or terminal displays that it is starting the development server at `http://0.0.0.0:8000/`
@@ -42,23 +42,35 @@ docker-compose up
10. Still in the new CMD or terminal, load the sample data. This script requires a local postgres installation to connect to and alter our database.

```
./load_sample_data.sh
just init
```

11. Still in the new CMD or terminal, hit the API with a request

```
curl localhost:8000/v1/images?q=honey
just healthcheck
```

12. Make sure you see the following response from the API
![Sample API_Request](localhost_request.PNG)

Congratulations! You just ran the server locally.

To access the logs, run:

```
just logs
```

That will follow the logs for all of the services. To isolate a service, simply pass the service name, for example:

```
just logs web
```
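
These recipes are thin wrappers around `docker-compose`. Since the usual `docker-compose` commands still work, `just logs web` is roughly equivalent to the following; a sketch only, as the exact recipe lives in the justfile added by this PR:

```
docker-compose logs -f web
```
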

### What Happens In the Background

After executing `docker-compose up` (in Step 5), you will be running:
After executing `just up` (in Step 5), you will be running:

- A Django API server
- Two PostgreSQL instances (one simulates the upstream data source, the other serves as the application database)
@@ -104,34 +116,24 @@ Every week, the latest version of the data is automatically bulk copied ("ingest

You can check the health of a live deployment of the API by running the live integration tests.

1. Change directory to the `openverse-api`
1. Run the install recipe:

```
cd openverse-api
just install
```

#### On the host

1. Install all dependencies for Openverse API.
1. Run the tests in a Pipenv subshell.
```
pipenv install
```

2. Run the tests in a Pipenv subshell.
```
pipenv run bash ./test/run_test.sh
just testlocal
```

#### Inside the container

1. Ensure that Docker containers are up. See the section above for instructions.
```
docker-compose ps
```

2. Run the tests in an interactive TTY connected to a `web` container.
1. Run the tests in an interactive TTY connected to a `web` container.
```
docker-compose exec web bash ./test/run_test.sh
just test
```

### How to Run Ingestion Server tests
55 changes: 28 additions & 27 deletions analytics/attribution_worker.py
@@ -1,13 +1,14 @@
import settings
import json
import logging as log
import urllib.parse as urlparse
from urllib.parse import parse_qs
from uuid import UUID

import settings
from confluent_kafka import Consumer
from models import AttributionReferrerEvent
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from confluent_kafka import Consumer


def parse_identifier(resource):
@@ -17,7 +18,7 @@ def parse_identifier(resource):
if query:
try:
query_parsed = parse_qs(query)
image_id = query_parsed['image_id'][0]
image_id = query_parsed["image_id"][0]
identifier = str(UUID(image_id))
except (KeyError, ValueError, TypeError):
identifier = None
@@ -29,34 +30,34 @@ def parse_message(msg):
return None
try:
decoded = json.loads(msg)
decoded = json.loads(scrub_malformed(decoded['message']))
resource = decoded['request'].split(' ')[1]
decoded = json.loads(scrub_malformed(decoded["message"]))
resource = decoded["request"].split(" ")[1]
_id = parse_identifier(resource)
parsed = {
'http_referer': decoded['http_referer'],
'resource': decoded['request'].split(' ')[1],
'identifier': _id
"http_referer": decoded["http_referer"],
"resource": decoded["request"].split(" ")[1],
"identifier": _id,
}
except (json.JSONDecodeError, KeyError):
log.warning(f'Failed to parse {msg}. Reason: ', exc_info=True)
log.warning(f"Failed to parse {msg}. Reason: ", exc_info=True)
parsed = None
return parsed


def save_message(validated_msg: dict, session):
event = AttributionReferrerEvent(
image_uuid=validated_msg['identifier'],
full_referer=validated_msg['http_referer'],
referer_domain=urlparse.urlparse(validated_msg['http_referer']).netloc,
resource=validated_msg['resource']
image_uuid=validated_msg["identifier"],
full_referer=validated_msg["http_referer"],
referer_domain=urlparse.urlparse(validated_msg["http_referer"]).netloc,
resource=validated_msg["resource"],
)
session.add(event)
session.commit()


def scrub_malformed(_json: str):
""" Remove some invalid JSON that NGINX sometimes spits out """
return _json.replace('\"upstream_response_time\":,', '')
"""Remove some invalid JSON that NGINX sometimes spits out"""
return _json.replace('"upstream_response_time":,', "")


def is_valid(parsed_msg: dict):
@@ -68,9 +69,9 @@ def is_valid(parsed_msg: dict):
if parsed_msg is None:
return False
try:
referer = parsed_msg['http_referer']
resource = parsed_msg['resource']
valid = 'creativecommons.org' not in referer and '.svg' in resource
referer = parsed_msg["http_referer"]
resource = parsed_msg["resource"]
valid = "creativecommons.org" not in referer and ".svg" in resource
except KeyError:
valid = False
return valid
@@ -83,28 +84,28 @@ def listen(consumer, database):
while True:
msg = consumer.poll(timeout=timeout)
if msg:
parsed_msg = parse_message(str(msg.value(), 'utf-8'))
parsed_msg = parse_message(str(msg.value(), "utf-8"))
if is_valid(parsed_msg):
save_message(parsed_msg, database)
saved += 1
else:
ignored += 1
else:
log.info('No message received in {timeout}')
log.info("No message received in {timeout}")
if saved + ignored % 100 == 0:
log.info(f'Saved {saved} attribution events, ignored {ignored}')
log.info(f"Saved {saved} attribution events, ignored {ignored}")


if __name__ == '__main__':
if __name__ == "__main__":
log.basicConfig(
filename=settings.ATTRIBUTION_LOGFILE,
format='%(asctime)s %(message)s',
level=log.INFO
format="%(asctime)s %(message)s",
level=log.INFO,
)
consumer_settings = {
'bootstrap.servers': settings.KAFKA_HOSTS,
'group.id': 'attribution_streamer',
'auto.offset.reset': 'earliest'
"bootstrap.servers": settings.KAFKA_HOSTS,
"group.id": "attribution_streamer",
"auto.offset.reset": "earliest",
}
c = Consumer(consumer_settings)
c.subscribe([settings.KAFKA_TOPIC_NAME])
14 changes: 10 additions & 4 deletions analytics/backdate.py
@@ -1,12 +1,18 @@
import datetime

import settings
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from analytics.report_controller import (
generate_usage_report, generate_source_usage_report,
generate_referrer_usage_report, generate_top_searches,
generate_top_result_clicks
generate_referrer_usage_report,
generate_source_usage_report,
generate_top_result_clicks,
generate_top_searches,
generate_usage_report,
)


"""
A one-off script for generating analytics reports back to September 2019, when
we first started collecting analytics data.
@@ -28,4 +34,4 @@
generate_top_result_clicks(session, start_date, current_end_date)

current_end_date -= datetime.timedelta(days=1)
print(f'Generated backdated reports for {current_end_date}')
print(f"Generated backdated reports for {current_end_date}")