This repository has been archived by the owner on Feb 22, 2023. It is now read-only.

Add pre-commit, format all files, and add justfile #224

Merged
merged 5 commits on Sep 22, 2021

Changes from all commits

13 changes: 13 additions & 0 deletions .flake8
@@ -0,0 +1,13 @@
[flake8]
# match black formatter's behavior
# https://www.flake8rules.com/rules/E203.html
# https://www.flake8rules.com/rules/W503.html
ignore = E203, W503
per-file-ignores =
    # Ignore maximum line length rule for test files
    *test*:E501
    # https://www.flake8rules.com/rules/F401.html; init files act as re-exporters
    *__init__*:F401
    # https://www.flake8rules.com/rules/E402.html; patches are applied before all imports are finished
    *wsgi.py:E402
max-line-length = 88
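
These rules are normally applied through the flake8 pre-commit hook configured later in this PR, but the file can also be exercised directly. A minimal sketch from the repository root, assuming flake8 is installed and using `openverse-api/catalog` (the path the old CI job linted) as the target:

```
flake8 --config=.flake8 openverse-api/catalog
```
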
6 changes: 3 additions & 3 deletions .github/dependabot.yml
@@ -2,7 +2,7 @@
# Dependabot Configuration File #
#################################

# current Github-native version of Dependabot
# current Github-native version of Dependabot
version: 2

updates:
@@ -17,7 +17,7 @@ updates:
- "💻 aspect: code"
- "🧰 goal: internal improvement"
- "dependencies"

# Enable version updates for Python libs in Openverse API
- package-ecosystem: 'pip'
# Look for a `Pipfile` in the `/openverse-api` directory
@@ -30,7 +30,7 @@ updates:
- "🧰 goal: internal improvement"
- "dependencies"
- "python"

# Enable version updates for Python libs in ingestion server
- package-ecosystem: 'pip'
# Look for a `Pipfile` in the `/ingestion_server` directory
23 changes: 13 additions & 10 deletions .github/workflows/integration-tests.yml
@@ -9,18 +9,21 @@ on:
workflow_dispatch:

jobs:
Style:
Linting:
runs-on: ubuntu-latest
steps:
- uses: actions/setup-python@v2
- name: Install pycodestyle
run: pip install pycodestyle
- name: Checkout
uses: actions/checkout@v2
- name: Check API style
run: pycodestyle openverse-api/catalog --exclude='openverse-api/catalog/api/migrations,openverse-api/catalog/example_responses.py' --max-line-length=80 --ignore=E402,E702
- name: Check ingestion-server style
run: pycodestyle ingestion_server/ingestion_server --max-line-length=80 --ignore=E402
- uses: actions/checkout@v2
- name: Set up Python 3.9
uses: actions/setup-python@v2
with:
python-version: 3.9
- name: Lint
working-directory: ./openverse-api
run: |
pip install --upgrade pipenv
pipenv install --dev
pipenv run pre-commit run --all-files

Tests:
timeout-minutes: 15
runs-on: ubuntu-latest
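
The rewritten `Linting` job replaces the ad-hoc pycodestyle invocations with the project's pre-commit hooks. The same check can be reproduced locally with essentially the commands from the job's `run` block; a sketch, assuming Python 3.9 and pip are already available:

```
cd openverse-api
pip install --upgrade pipenv
pipenv install --dev
pipenv run pre-commit run --all-files
```
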
2 changes: 1 addition & 1 deletion .github/workflows/pr_label_check.yml
@@ -27,4 +27,4 @@ jobs:
- name: Check goal label
uses: sugarshin/[email protected]
with:
required_oneof: '🌟 goal: addition,🛠 goal: fix,✨ goal: improvement,🧰 goal: internal improvement'
required_oneof: '🌟 goal: addition,🛠 goal: fix,✨ goal: improvement,🧰 goal: internal improvement'
51 changes: 51 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,51 @@
exclude: Pipfile\.lock|migrations|\.idea

repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.0.1
    hooks:
      - id: trailing-whitespace
      - id: check-executables-have-shebangs
      - id: check-json
      - id: check-case-conflict
      - id: check-toml
      - id: check-merge-conflict
      - id: check-xml
      - id: check-yaml
      - id: end-of-file-fixer
      - id: check-symlinks
      - id: mixed-line-ending
      - id: fix-encoding-pragma
        args:
          - --remove
      - id: pretty-format-json
        args:
          - --autofix
      - id: requirements-txt-fixer

  - repo: https://github.com/PyCQA/isort
    rev: 5.9.1
    hooks:
      - id: isort
        files: \.py$
        exclude: ^build/.*$|^.tox/.*$|^venv/.*$
        args:
          - --lines-after-imports=2
          - --multi-line=3
          - --trailing-comma
          - --force-grid-wrap=0
          - --use-parentheses
          - --ensure-newline-before-comments
          - --line-length=88

  - repo: https://github.com/PyCQA/flake8
    rev: 3.9.2
    hooks:
      - id: flake8

  - repo: https://github.com/ambv/black
    rev: 21.6b0
    hooks:
      - id: black
        args:
          - --safe
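
To have these hooks run on every commit rather than only in CI, a contributor would register them once. A minimal sketch, assuming pre-commit is installed in the `openverse-api` Pipenv environment as in the workflow above:

```
cd openverse-api
pipenv run pre-commit install                # register the git hook
pipenv run pre-commit run black --all-files  # optionally run a single hook by id
```
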
46 changes: 24 additions & 22 deletions README.md
@@ -14,7 +14,7 @@ In the [API documentation](https://api.openverse.engineering), you can find more

### Prerequisites

You need to install [Docker](https://docs.docker.com/install/) (with [Docker Compose](https://docs.docker.com/compose/install/)), [Git](https://git-scm.com/downloads), and [PostgreSQL client tools](https://www.postgresql.org/download/). On Debian, the package is called `postgresql-client-common`.
You need to install [Docker](https://docs.docker.com/install/) (with [Docker Compose](https://docs.docker.com/compose/install/)), [Git](https://git-scm.com/downloads), and [PostgreSQL client tools](https://www.postgresql.org/download/). On Debian, the package is called `postgresql-client-common`. You'll also want to install the [just](https://github.com/casey/just) command runner.

### Running locally

Expand All @@ -27,10 +27,10 @@ git clone https://github.com/WordPress/openverse-api.git
```

4. Change directories with `cd openverse-api`
5. Start Openverse API locally by running the docker containers
5. Start the Openverse API locally by running the Docker containers. You can use the usual `docker-compose` commands or the simplified `just` commands. You will need the [just](https://github.com/casey/just#installation) command runner installed to follow the next steps.

```
docker-compose up
just up
```

6. Wait until your CMD or terminal displays that it is starting the development server at `http://0.0.0.0:8000/`
@@ -42,23 +42,35 @@ docker-compose up
10. Still in the new CMD or terminal, load the sample data. This script requires a local postgres installation to connect to and alter our database.

```
./load_sample_data.sh
just init
```

11. Still in the new CMD or terminal, hit the API with a request

```
curl localhost:8000/v1/images?q=honey
just healthcheck
```

12. Make sure you see the following response from the API
![Sample API_Request](localhost_request.PNG)

Congratulations! You just ran the server locally.

To access the logs, run:

```
just logs
```

That will follow the logs for all of the services. To isolate a service, simply pass the service name, for example:

```
just logs web
```
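
These recipes are thin wrappers around `docker-compose`. Since the usual `docker-compose` commands still work, `just logs web` is roughly equivalent to the following; a sketch only, as the exact recipe lives in the justfile added by this PR:

```
docker-compose logs -f web
```
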

### What Happens In the Background

After executing `docker-compose up` (in Step 5), you will be running:
After executing `just up` (in Step 5), you will be running:

- A Django API server
- Two PostgreSQL instances (one simulates the upstream data source, the other serves as the application database)
@@ -104,34 +116,24 @@ Every week, the latest version of the data is automatically bulk copied ("ingest

You can check the health of a live deployment of the API by running the live integration tests.

1. Change directory to the `openverse-api`
1. Run the install recipe:

```
cd openverse-api
just install
```

#### On the host

1. Install all dependencies for Openverse API.
1. Run the tests in a Pipenv subshell.
```
pipenv install
```

2. Run the tests in a Pipenv subshell.
```
pipenv run bash ./test/run_test.sh
just testlocal
```

#### Inside the container

1. Ensure that Docker containers are up. See the section above for instructions.
```
docker-compose ps
```

2. Run the tests in an interactive TTY connected to a `web` container.
1. Run the tests in an interactive TTY connected to a `web` container.
```
docker-compose exec web bash ./test/run_test.sh
just test
```

### How to Run Ingestion Server tests
55 changes: 28 additions & 27 deletions analytics/attribution_worker.py
@@ -1,13 +1,14 @@
import settings
import json
import logging as log
import urllib.parse as urlparse
from urllib.parse import parse_qs
from uuid import UUID

import settings
from confluent_kafka import Consumer
from models import AttributionReferrerEvent
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from confluent_kafka import Consumer


def parse_identifier(resource):
@@ -17,7 +18,7 @@ def parse_identifier(resource):
if query:
try:
query_parsed = parse_qs(query)
image_id = query_parsed['image_id'][0]
image_id = query_parsed["image_id"][0]
identifier = str(UUID(image_id))
except (KeyError, ValueError, TypeError):
identifier = None
@@ -29,34 +30,34 @@ def parse_message(msg):
return None
try:
decoded = json.loads(msg)
decoded = json.loads(scrub_malformed(decoded['message']))
resource = decoded['request'].split(' ')[1]
decoded = json.loads(scrub_malformed(decoded["message"]))
resource = decoded["request"].split(" ")[1]
_id = parse_identifier(resource)
parsed = {
'http_referer': decoded['http_referer'],
'resource': decoded['request'].split(' ')[1],
'identifier': _id
"http_referer": decoded["http_referer"],
"resource": decoded["request"].split(" ")[1],
"identifier": _id,
}
except (json.JSONDecodeError, KeyError):
log.warning(f'Failed to parse {msg}. Reason: ', exc_info=True)
log.warning(f"Failed to parse {msg}. Reason: ", exc_info=True)
parsed = None
return parsed


def save_message(validated_msg: dict, session):
event = AttributionReferrerEvent(
image_uuid=validated_msg['identifier'],
full_referer=validated_msg['http_referer'],
referer_domain=urlparse.urlparse(validated_msg['http_referer']).netloc,
resource=validated_msg['resource']
image_uuid=validated_msg["identifier"],
full_referer=validated_msg["http_referer"],
referer_domain=urlparse.urlparse(validated_msg["http_referer"]).netloc,
resource=validated_msg["resource"],
)
session.add(event)
session.commit()


def scrub_malformed(_json: str):
""" Remove some invalid JSON that NGINX sometimes spits out """
return _json.replace('\"upstream_response_time\":,', '')
"""Remove some invalid JSON that NGINX sometimes spits out"""
return _json.replace('"upstream_response_time":,', "")


def is_valid(parsed_msg: dict):
@@ -68,9 +69,9 @@ def is_valid(parsed_msg: dict):
if parsed_msg is None:
return False
try:
referer = parsed_msg['http_referer']
resource = parsed_msg['resource']
valid = 'creativecommons.org' not in referer and '.svg' in resource
referer = parsed_msg["http_referer"]
resource = parsed_msg["resource"]
valid = "creativecommons.org" not in referer and ".svg" in resource
except KeyError:
valid = False
return valid
@@ -83,28 +84,28 @@ def listen(consumer, database):
while True:
msg = consumer.poll(timeout=timeout)
if msg:
parsed_msg = parse_message(str(msg.value(), 'utf-8'))
parsed_msg = parse_message(str(msg.value(), "utf-8"))
if is_valid(parsed_msg):
save_message(parsed_msg, database)
saved += 1
else:
ignored += 1
else:
log.info('No message received in {timeout}')
log.info("No message received in {timeout}")
if saved + ignored % 100 == 0:
log.info(f'Saved {saved} attribution events, ignored {ignored}')
log.info(f"Saved {saved} attribution events, ignored {ignored}")


if __name__ == '__main__':
if __name__ == "__main__":
log.basicConfig(
filename=settings.ATTRIBUTION_LOGFILE,
format='%(asctime)s %(message)s',
level=log.INFO
format="%(asctime)s %(message)s",
level=log.INFO,
)
consumer_settings = {
'bootstrap.servers': settings.KAFKA_HOSTS,
'group.id': 'attribution_streamer',
'auto.offset.reset': 'earliest'
"bootstrap.servers": settings.KAFKA_HOSTS,
"group.id": "attribution_streamer",
"auto.offset.reset": "earliest",
}
c = Consumer(consumer_settings)
c.subscribe([settings.KAFKA_TOPIC_NAME])
14 changes: 10 additions & 4 deletions analytics/backdate.py
@@ -1,12 +1,18 @@
import datetime

import settings
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from analytics.report_controller import (
generate_usage_report, generate_source_usage_report,
generate_referrer_usage_report, generate_top_searches,
generate_top_result_clicks
generate_referrer_usage_report,
generate_source_usage_report,
generate_top_result_clicks,
generate_top_searches,
generate_usage_report,
)


"""
A one-off script for generating analytics reports back to September 2019, when
we first started collecting analytics data.
@@ -28,4 +34,4 @@
generate_top_result_clicks(session, start_date, current_end_date)

current_end_date -= datetime.timedelta(days=1)
print(f'Generated backdated reports for {current_end_date}')
print(f"Generated backdated reports for {current_end_date}")